From: Tom Stellard Date: Sat, 13 Jun 2015 03:28:10 +0000 (+0000) Subject: R600 -> AMDGPU rename X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=953c6814730951ad9a286d7991e9c8c481433d45;p=oota-llvm.git R600 -> AMDGPU rename git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239657 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 026fe479abd..da731497997 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) set(LLVM_ALL_TARGETS AArch64 + AMDGPU ARM BPF CppBackend @@ -184,7 +185,6 @@ set(LLVM_ALL_TARGETS MSP430 NVPTX PowerPC - R600 Sparc SystemZ X86 diff --git a/autoconf/configure.ac b/autoconf/configure.ac index 11ba0511799..5b70fbd1bbf 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -1097,7 +1097,7 @@ if test "$llvm_cv_enable_crash_overrides" = "yes" ; then fi dnl List all possible targets -ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF" AC_SUBST(ALL_TARGETS,$ALL_TARGETS) dnl Allow specific targets to be specified for building (or not) @@ -1132,7 +1132,8 @@ case "$enableval" in hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;; nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;; systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;; - r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;; + amdgpu) ;& + r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;; host) case "$llvm_cv_target_arch" in x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; diff --git a/configure b/configure index 6cb9f2d9281..73fce67b058 100755 --- a/configure +++ b/configure @@ -5628,7 +5628,7 @@ _ACEOF fi -ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF" ALL_TARGETS=$ALL_TARGETS @@ -5665,7 +5665,8 @@ case "$enableval" in hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;; nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;; systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;; - r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;; + amdgpu) ;& + r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;; host) case "$llvm_cv_target_arch" in x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst new file mode 100644 index 00000000000..3cb41cebfff --- /dev/null +++ b/docs/AMDGPUUsage.rst @@ -0,0 +1,94 @@ +============================== +User Guide for AMDGPU Back-end +============================== + +Introduction +============ + +The AMDGPU back-end provides ISA code generation for AMD GPUs, starting with +the R600 family up until the current Volcanic Islands (GCN Gen 3). + + +Assembler +========= + +The assembler is currently considered experimental. + +For syntax examples look in test/MC/AMDGPU. + +Below some of the currently supported features (modulo bugs). These +all apply to the Southern Islands ISA, Sea Islands and Volcanic Islands +are also supported but may be missing some instructions and have more bugs: + +DS Instructions +--------------- +All DS instructions are supported. 
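+
+For illustration only (the operand choices below are invented for this guide;
+test/MC/AMDGPU holds the authoritative syntax tests):
+
+.. code-block:: nasm
+
+  // Read and write one dword of LDS through a byte offset.
+  ds_read_b32 v1, v2 offset:16
+  ds_write_b32 v2, v1 offset:16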
+ +FLAT Instructions +------------------ +These instructions are only present in the Sea Islands and Volcanic Islands +instruction set. All FLAT instructions are supported for these architectures + +MUBUF Instructions +------------------ +All non-atomic MUBUF instructions are supported. + +SMRD Instructions +----------------- +Only the s_load_dword* SMRD instructions are supported. + +SOP1 Instructions +----------------- +All SOP1 instructions are supported. + +SOP2 Instructions +----------------- +All SOP2 instructions are supported. + +SOPC Instructions +----------------- +All SOPC instructions are supported. + +SOPP Instructions +----------------- + +Unless otherwise mentioned, all SOPP instructions that have one or more +operands accept integer operands only. No verification is performed +on the operands, so it is up to the programmer to be familiar with the +range or acceptable values. + +s_waitcnt +^^^^^^^^^ + +s_waitcnt accepts named arguments to specify which memory counter(s) to +wait for. + +.. code-block:: nasm + + // Wait for all counters to be 0 + s_waitcnt 0 + + // Equivalent to s_waitcnt 0. Counter names can also be delimited by + // '&' or ','. + s_waitcnt vmcnt(0) expcnt(0) lgkcmt(0) + + // Wait for vmcnt counter to be 1. + s_waitcnt vmcnt(1) + +VOP1, VOP2, VOP3, VOPC Instructions +----------------------------------- + +All 32-bit and 64-bit encodings should work. + +The assembler will automatically detect which encoding size to use for +VOP1, VOP2, and VOPC instructions based on the operands. If you want to force +a specific encoding size, you can add an _e32 (for 32-bit encoding) or +_e64 (for 64-bit encoding) suffix to the instruction. Most, but not all +instructions support an explicit suffix. These are all valid assembly +strings: + +.. code-block:: nasm + + v_mul_i32_i24 v1, v2, v3 + v_mul_i32_i24_e32 v1, v2, v3 + v_mul_i32_i24_e64 v1, v2, v3 diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst index 2dfdc9b142d..900ba24e230 100644 --- a/docs/CompilerWriterInfo.rst +++ b/docs/CompilerWriterInfo.rst @@ -68,8 +68,8 @@ Other documents, collections, notes * `PowerPC64 alignment of long doubles (from GCC) `_ * `Long branch stubs for powerpc64-linux (from binutils) `_ -R600 ----- +AMDGPU +------ * `AMD R6xx shader ISA `_ * `AMD R7xx shader ISA `_ diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 18b3c1d87cc..212fa0b5833 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -711,7 +711,7 @@ used by people developing LLVM. | | as ``LLVM_ALL_TARGETS``, and can be set to include | | | out-of-tree targets. The default value includes: | | | ``AArch64, ARM, CppBackend, Hexagon, | -| | Mips, MSP430, NVPTX, PowerPC, R600, Sparc, | +| | Mips, MSP430, NVPTX, PowerPC, AMDGPU, Sparc, | | | SystemZ, X86, XCore``. | +-------------------------+----------------------------------------------------+ | LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source | diff --git a/docs/R600Usage.rst b/docs/R600Usage.rst deleted file mode 100644 index 9bd16f46098..00000000000 --- a/docs/R600Usage.rst +++ /dev/null @@ -1,94 +0,0 @@ -============================ -User Guide for R600 Back-end -============================ - -Introduction -============ - -The R600 back-end provides ISA code generation for AMD GPUs, starting with -the R600 family up until the current Volcanic Islands (GCN Gen 3). - - -Assembler -========= - -The assembler is currently considered experimental. - -For syntax examples look in test/MC/R600. 
- -Below some of the currently supported features (modulo bugs). These -all apply to the Southern Islands ISA, Sea Islands and Volcanic Islands -are also supported but may be missing some instructions and have more bugs: - -DS Instructions ---------------- -All DS instructions are supported. - -FLAT Instructions ------------------- -These instructions are only present in the Sea Islands and Volcanic Islands -instruction set. All FLAT instructions are supported for these architectures - -MUBUF Instructions ------------------- -All non-atomic MUBUF instructions are supported. - -SMRD Instructions ------------------ -Only the s_load_dword* SMRD instructions are supported. - -SOP1 Instructions ------------------ -All SOP1 instructions are supported. - -SOP2 Instructions ------------------ -All SOP2 instructions are supported. - -SOPC Instructions ------------------ -All SOPC instructions are supported. - -SOPP Instructions ------------------ - -Unless otherwise mentioned, all SOPP instructions that have one or more -operands accept integer operands only. No verification is performed -on the operands, so it is up to the programmer to be familiar with the -range or acceptable values. - -s_waitcnt -^^^^^^^^^ - -s_waitcnt accepts named arguments to specify which memory counter(s) to -wait for. - -.. code-block:: nasm - - // Wait for all counters to be 0 - s_waitcnt 0 - - // Equivalent to s_waitcnt 0. Counter names can also be delimited by - // '&' or ','. - s_waitcnt vmcnt(0) expcnt(0) lgkcmt(0) - - // Wait for vmcnt counter to be 1. - s_waitcnt vmcnt(1) - -VOP1, VOP2, VOP3, VOPC Instructions ------------------------------------ - -All 32-bit and 64-bit encodings should work. - -The assembler will automatically detect which encoding size to use for -VOP1, VOP2, and VOPC instructions based on the operands. If you want to force -a specific encoding size, you can add an _e32 (for 32-bit encoding) or -_e64 (for 64-bit encoding) suffix to the instruction. Most, but not all -instructions support an explicit suffix. These are all valid assembly -strings: - -.. code-block:: nasm - - v_mul_i32_i24 v1, v2, v3 - v_mul_i32_i24_e32 v1, v2, v3 - v_mul_i32_i24_e64 v1, v2, v3 diff --git a/docs/index.rst b/docs/index.rst index 2cc5b8bf095..0b681180970 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -252,7 +252,7 @@ For API clients and LLVM developers. WritingAnLLVMPass HowToUseAttributes NVPTXUsage - R600Usage + AMDGPUUsage StackMaps InAlloca BigEndianNEON @@ -338,8 +338,8 @@ For API clients and LLVM developers. :doc:`NVPTXUsage` This document describes using the NVPTX back-end to compile GPU kernels. -:doc:`R600Usage` - This document describes how to use the R600 back-end. +:doc:`AMDGPUUsage` + This document describes how to use the AMDGPU back-end. :doc:`StackMaps` LLVM support for mapping instruction addresses to the location of diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h new file mode 100644 index 00000000000..0a05d25189b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -0,0 +1,148 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H +#define LLVM_LIB_TARGET_R600_AMDGPU_H + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUInstrPrinter; +class AMDGPUSubtarget; +class AMDGPUTargetMachine; +class FunctionPass; +class MCAsmInfo; +class raw_ostream; +class Target; +class TargetMachine; + +// R600 Passes +FunctionPass *createR600VectorRegMerger(TargetMachine &tm); +FunctionPass *createR600TextureIntrinsicsReplacer(); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600EmitClauseMarkers(); +FunctionPass *createR600ClauseMergePass(TargetMachine &tm); +FunctionPass *createR600Packetizer(TargetMachine &tm); +FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); +FunctionPass *createAMDGPUCFGStructurizerPass(); + +// SI Passes +FunctionPass *createSITypeRewriter(); +FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIFoldOperandsPass(); +FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); +FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSIFixControlFlowLiveIntervalsPass(); +FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLiveRangesPass(); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIPrepareScratchRegs(); + +void initializeSIFoldOperandsPass(PassRegistry &); +extern char &SIFoldOperandsID; + +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + +void initializeSILoadStoreOptimizerPass(PassRegistry &); +extern char &SILoadStoreOptimizerID; + +// Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); +Pass *createAMDGPUStructurizeCFGPass(); +FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +ModulePass *createAMDGPUAlwaysInlinePass(); + +void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); +extern char &SIFixControlFlowLiveIntervalsID; + +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +namespace AMDGPU { +enum TargetIndex { + TI_CONSTDATA_START, + TI_SCRATCH_RSRC_DWORD0, + TI_SCRATCH_RSRC_DWORD1, + TI_SCRATCH_RSRC_DWORD2, + TI_SCRATCH_RSRC_DWORD3 +}; +} + +#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a separate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces : unsigned { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. 
+ PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + + // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this + // order to be able to dynamically index a constant buffer, for example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u +}; + +} // namespace AMDGPUAS + +#endif diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td new file mode 100644 index 00000000000..2e7e39a54d3 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -0,0 +1,266 @@ +//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Subtarget Features +//===----------------------------------------------------------------------===// + +// Debugging Features + +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", + "EnableIRStructurizer", + "false", + "Disable IR Structurizer">; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass">; + +// Target features + +def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", + "EnableIfCvt", + "false", + "Disable the if conversion pass">; + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations">; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64]>; + +def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add", + []>; + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. 
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "true", + "Specify if 64-bit addressing should be used">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache">; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA">; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug">; + +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass">; + +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory">; + +def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", + "SGPRInitBug", + "true", + "VI SGPR initilization bug requiring a fixed SGPR allocation size">; + +class SubtargetFeatureFetchLimit : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +class SubtargetFeatureWavefrontSize : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast(Value), + "The number of threads per wavefront">; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureLDSBankCount : SubtargetFeature < + "ldsbankcount"#Value, + "LDSBankCount", + !cast(Value), + "The number of LDS banks per compute unit.">; + +def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; +def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; + +class SubtargetFeatureLocalMemorySize : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast(Value), + "The size of local memory in bytes">; + +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU">; + +def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", + "GCN1Encoding", + "true", + "Encoding format for SI and CI">; + +def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", + "GCN3Encoding", + "true", + "Encoding format for VI">; + +def FeatureCIInsts : SubtargetFeature<"ci-insts", + "CIInsts", + "true", + "Additional intstructions for CI+">; + +// Dummy feature used to disable assembler instructions. 
+def FeatureDisable : SubtargetFeature<"", + "FeatureDisable","true", + "Dummy feature to disable assembler" + " instructions">; + +class SubtargetFeatureGeneration Implies> : + SubtargetFeature ; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +def FeatureR600 : SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; + +def FeatureR700 : SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; + +def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; + +def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + +def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32]>; + +def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, + FeatureGCN1Encoding, FeatureCIInsts]>; + +def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + +//===----------------------------------------------------------------------===// + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def AMDGPUAsmParser : AsmParser { + // Some of the R600 registers have the same name, so this crashes. + // For example T0_XYZW and T0_XY both have the asm name T0. 
+ let ShouldEmitMatchRegisterName = 0; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyParsers = [AMDGPUAsmParser]; +} + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +def TruePredicate : Predicate<"true">; +def isSICI : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" +>, AssemblerPredicate<"FeatureGCN1Encoding">; + +class PredicateControl { + Predicate SubtargetPredicate; + Predicate SIAssemblerPredicate = isSICI; + list AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list OtherPredicates = []; + list Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" +include "AMDGPUCallingConv.td" diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp new file mode 100644 index 00000000000..0b426bc63dd --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -0,0 +1,67 @@ +//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass marks all internal functions as always_inline and creates +/// duplicates of all other functions a marks the duplicates as always_inline. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +class AMDGPUAlwaysInline : public ModulePass { + + static char ID; + +public: + AMDGPUAlwaysInline() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } +}; + +} // End anonymous namespace + +char AMDGPUAlwaysInline::ID = 0; + +bool AMDGPUAlwaysInline::runOnModule(Module &M) { + + std::vector FuncsToClone; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && + !F.hasFnAttribute(Attribute::NoInline)) + FuncsToClone.push_back(&F); + } + + for (Function *F : FuncsToClone) { + ValueToValueMapTy VMap; + Function *NewFunc = CloneFunction(F, VMap, false); + NewFunc->setLinkage(GlobalValue::InternalLinkage); + F->getParent()->getFunctionList().push_back(NewFunc); + F->replaceAllUsesWith(NewFunc); + } + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { + F.addFnAttr(Attribute::AlwaysInline); + } + } + return false; +} + +ModulePass *llvm::createAMDGPUAlwaysInlinePass() { + return new AMDGPUAlwaysInline(); +} diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp new file mode 100644 index 00000000000..29c2da61add --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -0,0 +1,600 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. +// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUAsmPrinter.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "AMDGPU.h" +#include "AMDKernelCodeT.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be override to flush with the -cl-denorms-are-zero compiler flag. 
+// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. +// +// FIXME: It seems some instructions do not support single precision denormals +// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, +// and sin_f32, cos_f32 on most parts). + +// We want to use these instructions, and using fp32 denormals also causes +// instructions to run at the double precision rate for the device so it's +// probably best to just report no single precision denormals. +static uint32_t getFPMode(const MachineFunction &F) { + const AMDGPUSubtarget& ST = F.getSubtarget(); + // TODO: Is there any real use for the flush in only / flush out only modes? + + uint32_t FP32Denormals = + ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + uint32_t FP64Denormals = + ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP32Denormals) | + FP_DENORM_MODE_DP(FP64Denormals); +} + +static AsmPrinter * +createAMDGPUAsmPrinterPass(TargetMachine &tm, + std::unique_ptr &&Streamer) { + return new AMDGPUAsmPrinter(tm, std::move(Streamer)); +} + +extern "C" void LLVMInitializeAMDGPUAsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); +} + +AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer) + : AsmPrinter(TM, std::move(Streamer)) {} + +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + + // This label is used to mark the end of the .text section. + const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + OutStreamer->SwitchSection(TLOF.getTextSection()); + MCSymbol *EndOfTextLabel = + OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + OutStreamer->EmitLabel(EndOfTextLabel); +} + +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + // The starting address of all shader programs must be 256 bytes aligned. 
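+  // setAlignment() takes a log2 value, so 8 requests 1 << 8 = 256 byte
+  // alignment; the EmitCodeAlignment(2 << (MF.getAlignment() - 1)) call below
+  // turns it back into a byte count.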
+ MF.setAlignment(8); + + SetupMachineFunction(MF); + + MCContext &Context = getObjFileLowering().getContext(); + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + + const AMDGPUSubtarget &STM = MF.getSubtarget(); + SIProgramInfo KernelInfo; + if (STM.isAmdHsaOS()) { + getSIProgramInfo(KernelInfo, MF); + EmitAmdKernelCodeT(MF, KernelInfo); + OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1)); + } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); + EmitProgramInfoSI(MF, KernelInfo); + } else { + EmitProgramInfoR600(MF); + } + + DisasmLines.clear(); + HexLines.clear(); + DisasmLineMaxLen = 0; + + EmitFunctionBody(); + + if (isVerbose()) { + MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(CommentSection); + + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + OutStreamer->emitRawComment(" Kernel info:", false); + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); + OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), + false); + OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), + false); + OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); + OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); + } else { + R600MachineFunctionInfo *MFI = MF.getInfo(); + OutStreamer->emitRawComment( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); + } + } + + if (STM.dumpCode()) { + + OutStreamer->SwitchSection( + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); + + for (size_t i = 0; i < DisasmLines.size(); ++i) { + std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; + + OutStreamer->EmitBytes(StringRef(DisasmLines[i])); + OutStreamer->EmitBytes(StringRef(Comment)); + } + } + + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const AMDGPUSubtarget &STM = MF.getSubtarget(); + const R600RegisterInfo *RI = + static_cast(STM.getRegisterInfo()); + const R600MachineFunctionInfo *MFI = MF.getInfo(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + + // Register with value > 127 aren't GPR + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + // Evergreen / Northern Islands + switch (MFI->getShaderType()) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MFI->getShaderType()) { + default: // Fall through + case 
ShaderType::GEOMETRY: // Fall through + case ShaderType::COMPUTE: // Fall through + case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->StackSize), 4); + OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + } +} + +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + uint64_t CodeSize = 0; + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + bool FlatUsed = false; + const SIRegisterInfo *RI = + static_cast(STM.getRegisterInfo()); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: CodeSize should account for multiple functions. + CodeSize += MI.getDesc().Size; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + unsigned width = 0; + bool isSGPR = false; + + if (!MO.isReg()) { + continue; + } + unsigned reg = MO.getReg(); + if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || + reg == AMDGPU::VCC_HI) { + VCCUsed = true; + continue; + } else if (reg == AMDGPU::FLAT_SCR || + reg == AMDGPU::FLAT_SCR_LO || + reg == AMDGPU::FLAT_SCR_HI) { + FlatUsed = true; + continue; + } + + switch (reg) { + default: break; + case AMDGPU::SCC: + case AMDGPU::EXEC: + case AMDGPU::M0: + continue; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(reg)) { + isSGPR = false; + width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(reg)) { + isSGPR = false; + width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(reg)) { + isSGPR = true; + width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(reg)) { + isSGPR = false; + width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned hwReg = RI->getEncodingValue(reg) & 0xff; + unsigned maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; + } + } + } + } + + if (VCCUsed) + MaxSGPR += 2; + + if (FlatUsed) + MaxSGPR += 2; + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. 
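+  // For example, if the highest VGPR touched was v3, NumVGPR becomes 4; the
+  // VCC and flat-scratch adjustments above have already been folded into
+  // MaxSGPR.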
+ ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR = MaxSGPR + 1; + + if (STM.hasSGPRInitBug()) { + if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) + llvm_unreachable("Too many SGPRs used with the SGPR init bug"); + + ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + } + + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; + ProgInfo.CodeLen = CodeSize; + + unsigned LDSAlignShift; + if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + // LDS is allocated in 64 dword blocks. + LDSAlignShift = 8; + } else { + // LDS is allocated in 128 dword blocks. + LDSAlignShift = 9; + } + + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); + + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. ProgInfo.ScratchSize is the amount of + // scratch memory used per thread. + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | + S_00B84C_TGID_X_EN(1) | + S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); + + OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); + + OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + + // TODO: 
Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. + } else { + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + if (STM.isVGPRSpillingEnabled(MFI)) { + OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + } + } + + if (MFI->getShaderType() == ShaderType::PIXEL) { + OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + } +} + +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + const AMDGPUSubtarget &STM = MF.getSubtarget(); + amd_kernel_code_t header; + + memset(&header, 0, sizeof(header)); + + header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; + header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; + + header.struct_byte_size = sizeof(amd_kernel_code_t); + + header.target_chip = STM.getAmdKernelCodeChipID(); + + header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); + + header.compute_pgm_resource_registers = + KernelInfo.ComputePGMRSrc1 | + (KernelInfo.ComputePGMRSrc2 << 32); + + // Code Properties: + header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | + AMD_CODE_PROPERTY_IS_PTR64; + + if (KernelInfo.FlatUsed) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + if (KernelInfo.ScratchBlocks) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + + // MFI->ABIArgOffset is the number of bytes for the kernel arguments + // plus 36. 36 is the number of bytes reserved at the begining of the + // input buffer to store work-group size information. + // FIXME: We should be adding the size of the implicit arguments + // to this value. + header.kernarg_segment_byte_size = MFI->ABIArgOffset; + + header.wavefront_sgpr_count = KernelInfo.NumSGPR; + header.workitem_vgpr_count = KernelInfo.NumVGPR; + + // FIXME: What values do I put for these alignments + header.kernarg_segment_alignment = 0; + header.group_segment_alignment = 0; + header.private_segment_alignment = 0; + + header.code_type = 1; // HSA_EXT_CODE_KERNEL + + header.wavefront_size = STM.getWavefrontSize(); + + MCSectionELF *VersionSection = + OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(VersionSection); + OutStreamer->EmitBytes(Twine("HSA Code Unit:" + + Twine(header.hsail_version_major) + "." + + Twine(header.hsail_version_minor) + ":" + + "AMD:" + + Twine(header.amd_code_version_major) + "." 
+ + Twine(header.amd_code_version_minor) + ":" + + "GFX8.1:0").str()); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + + if (isVerbose()) { + OutStreamer->emitRawComment("amd_code_version_major = " + + Twine(header.amd_code_version_major), false); + OutStreamer->emitRawComment("amd_code_version_minor = " + + Twine(header.amd_code_version_minor), false); + OutStreamer->emitRawComment("struct_byte_size = " + + Twine(header.struct_byte_size), false); + OutStreamer->emitRawComment("target_chip = " + + Twine(header.target_chip), false); + OutStreamer->emitRawComment(" compute_pgm_rsrc1: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc1), + false); + OutStreamer->emitRawComment(" compute_pgm_rsrc2: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc2), + false); + OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); + OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); + OutStreamer->emitRawComment("private_element_size = 2 ", false); + OutStreamer->emitRawComment("is_ptr64 = " + + Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); + OutStreamer->emitRawComment("workitem_private_segment_byte_size = " + + Twine(header.workitem_private_segment_byte_size), + false); + OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " + + Twine(header.workgroup_group_segment_byte_size), + false); + OutStreamer->emitRawComment("gds_segment_byte_size = " + + Twine(header.gds_segment_byte_size), false); + OutStreamer->emitRawComment("kernarg_segment_byte_size = " + + Twine(header.kernarg_segment_byte_size), false); + OutStreamer->emitRawComment("wavefront_sgpr_count = " + + Twine(header.wavefront_sgpr_count), false); + OutStreamer->emitRawComment("workitem_vgpr_count = " + + Twine(header.workitem_vgpr_count), false); + OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false); + OutStreamer->emitRawComment("wavefront_size = " + + Twine((int)header.wavefront_size), false); + OutStreamer->emitRawComment("optimization_level = " + + Twine(header.optimization_level), false); + OutStreamer->emitRawComment("hsail_profile = " + + Twine(header.hsail_profile), false); + OutStreamer->emitRawComment("hsail_machine_model = " + + Twine(header.hsail_machine_model), false); + OutStreamer->emitRawComment("hsail_version_major = " + + Twine(header.hsail_version_major), false); + OutStreamer->emitRawComment("hsail_version_minor = " + + Twine(header.hsail_version_minor), false); + } + + OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header))); +} + +bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. 
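+    // Only single-character modifiers are recognized: 'r' falls through to
+    // the register printer below, and anything else is delegated to the
+    // generic AsmPrinter::PrintAsmOperand() handling.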
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'r': + break; + } + } + + AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, + *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h new file mode 100644 index 00000000000..1acff3a3222 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -0,0 +1,113 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" +#include + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { +private: + struct SIProgramInfo { + SIProgramInfo() : + VGPRBlocks(0), + SGPRBlocks(0), + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), + FlatUsed(false), + VCCUsed(false), + CodeLen(0) {} + + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + uint32_t ScratchSize; + + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; + bool FlatUsed; + + // Bonus information for debugging. + bool VCCUsed; + uint64_t CodeLen; + }; + + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void findNumUsedRegistersSI(const MachineFunction &MF, + unsigned &NumSGPR, + unsigned &NumVGPR) const; + + /// \brief Emit register usage information so that the GPU driver + /// can correctly setup the GPU state. 
+ void EmitProgramInfoR600(const MachineFunction &MF); + void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const; + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AMDGPU Assembly Printer"; + } + + /// Implemented in AMDGPUMCInstLower.cpp + void EmitInstruction(const MachineInstr *MI) override; + + void EmitEndOfAsmFile(Module &M) override; + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + +protected: + std::vector DisasmLines, HexLines; + size_t DisasmLineMaxLen; +}; + +} // End anonymous llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td new file mode 100644 index 00000000000..6ffa7a08358 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -0,0 +1,82 @@ +//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the AMD Radeon GPUs. +// +//===----------------------------------------------------------------------===// + +// Inversion of CCIfInReg +class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {} + +// Calling convention for SI +def CC_SI : CallingConv<[ + + CCIfInReg>>, + + CCIfInReg>>, + + CCIfNotInReg>>, + + CCIfByVal>> + +]>; + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg>> +]>; + +// Calling convention for compute kernels +def CC_AMDGPU_Kernel : CallingConv<[ + CCCustom<"allocateStack"> +]>; + +def CC_AMDGPU : CallingConv<[ + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() >=" + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo> +]>; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp new file mode 100644 index 00000000000..8175786fb9b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -0,0 +1,112 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +// Interface to describe a layout of a stack frame on a AMDIL target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { } + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { } + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. + // + // I think the StackWidth should stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by a R600 specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Start the offset at 2 so we don't overwrite work group information. + // XXX: We should only do this when the shader actually uses this + // information. + unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); + OffsetBytes += MFI->getObjectSize(i); + // Each register holds 4 bytes, so we must always align the offset to at + // least 4 bytes, so that 2 frame objects won't share the same register. 
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); + } + + if (FI != -1) + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); + + return OffsetBytes / (getStackWidth(MF) * 4); +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return nullptr; +} +void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const {} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h new file mode 100644 index 00000000000..9f31be1af79 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -0,0 +1,45 @@ +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on a AMDIL target +/// machine. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +/// \brief Information about the stack frame layout on the AMDGPU targets. +/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. + unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool hasFP(const MachineFunction &MF) const override; +}; +} // namespace llvm +#endif diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp new file mode 100644 index 00000000000..df4461eac4d --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -0,0 +1,1371 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. 
+// +//===----------------------------------------------------------------------===// +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIDefines.h" +#include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. + const AMDGPUSubtarget *Subtarget; +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + bool runOnMachineFunction(MachineFunction &MF) override; + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PostprocessISelDAG() override; + +private: + bool isInlineImmediate(SDNode *N) const; + bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, + const R600InstrInfo *TII); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); + bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector &); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + bool isCPLoad(const LoadSDNode *N) const; + bool isConstantLoad(const LoadSDNode *N, int cbID) const; + bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; + bool isParamLoad(const LoadSDNode *N) const; + bool isPrivateLoad(const LoadSDNode *N) const; + bool isLocalLoad(const LoadSDNode *N) const; + bool isRegionLoad(const LoadSDNode *N) const; + + SDNode *glueCopyToM0(SDNode *N) const; + + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue 
&VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, SDValue &Offset, + SDValue &SLC) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset, SDValue &GLC) const; + SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Omod) const; + bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); + + SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width); + SDNode *SelectS_BFEFromShifts(SDNode *N); + SDNode *SelectS_BFE(SDNode *N); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) + : SelectionDAGISel(TM) {} + +bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &static_cast(MF.getSubtarget()); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + +/// \brief Determine the register class for \p OpNo +/// \returns The register class of the virtual register that will be used for +/// the given operand number \OpNo or NULL if the register class cannot be +/// determined. 
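+///
+/// For REG_SEQUENCE nodes, for example, operand 0 holds the register class id
+/// and the remaining operands form (value, subregister index) pairs, so the
+/// class for \p OpNo is derived from the matching subregister index rather
+/// than from the instruction description.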
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, + unsigned OpNo) const { + if (!N->isMachineOpcode()) + return nullptr; + + switch (N->getMachineOpcode()) { + default: { + const MCInstrDesc &Desc = + Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + unsigned OpIdx = Desc.getNumDefs() + OpNo; + if (OpIdx >= Desc.getNumOperands()) + return nullptr; + int RegClass = Desc.OpInfo[OpIdx].RegClass; + if (RegClass == -1) + return nullptr; + + return Subtarget->getRegisterInfo()->getRegClass(RegClass); + } + case AMDGPU::REG_SEQUENCE: { + unsigned RCID = cast(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = + Subtarget->getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast(SubRegOp)->getZExtValue(); + return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, + SubRegIdx); + } + } +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + return true; +} + +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + !checkType(cast(N)->getMemOperand()->getValue(), + AMDGPUAS::LOCAL_ADDRESS)) + return N; + + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + // Write max value to m0 before each load operation + + SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), + CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + + SDValue Glue = M0.getValue(1); + + SmallVector Ops; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + Ops.push_back(N->getOperand(i)); + } + Ops.push_back(Glue); + CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); + + return N; +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. 
+ } + + if (isa(N)) + N = glueCopyToM0(N); + + switch (Opc) { + default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: + case ISD::SUB: { + if (N->getValueType(0) != MVT::i64 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectADD_SUB_I64(N); + } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: + case ISD::BUILD_VECTOR: { + unsigned RegClassID; + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + bool UseVReg = true; + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + if (!U->isMachineOpcode()) { + continue; + } + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (!RC) { + continue; + } + if (static_cast(TRI)->isSGPRClass(RC)) { + UseVReg = false; + } + } + switch(NumVectorElts) { + case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : + AMDGPU::SReg_32RegClassID; + break; + case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : + AMDGPU::SReg_64RegClassID; + break; + case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : + AMDGPU::SReg_128RegClassID; + break; + case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : + AMDGPU::SReg_256RegClassID; + break; + case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : + AMDGPU::SReg_512RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } else { + // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. + switch(NumVectorElts) { + case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } + + SDLoc DL(N); + SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + + if (NumVectorElts == 1) { + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, + N->getOperand(0), RegClass); + } + + assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + "supported yet"); + // 16 = Max Num Vector Elements + // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) + // 1 = Vector Register Class + SmallVector RegSeqArgs(NumVectorElts * 2 + 1); + + RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + bool IsRegSeq = true; + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { + // XXX: Why is this here? + if (isa(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[1 + (2 * i)] = N->getOperand(i); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + MVT::i32); + } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. 
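+ // E.g. a SCALAR_TO_VECTOR producing v4i32 only supplies operand 0; the
+ // remaining sub-registers of the REG_SEQUENCE are filled with one shared
+ // IMPLICIT_DEF value below.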
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + DL, EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + } + } + + if (!IsRegSeq) + break; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), + RegSeqArgs); + } + case ISD::BUILD_PAIR: { + SDValue RC, SubReg0, SubReg1; + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + break; + } + SDLoc DL(N); + if (N->getValueType(0) == MVT::i128) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); + } else if (N->getValueType(0) == MVT::i64) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + } else { + llvm_unreachable("Unhandled value type for BUILD_PAIR"); + } + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, + N->getOperand(1), SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + case ISD::Constant: + case ISD::ConstantFP: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = dyn_cast(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast(N); + Imm = C->getZExtValue(); + } + + SDLoc DL(N); + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, + MVT::i32)); + SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops); + } + + case ISD::LOAD: { + LoadSDNode *LD = cast(N); + SDLoc SL(N); + EVT VT = N->getValueType(0); + + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { + N = glueCopyToM0(N); + break; + } + + // To simplify the TableGen patters, we replace all i64 loads with + // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 + // during DAG legalization, however, so places (ExpandUnalignedLoad) + // in the DAG legalizer assume that if i64 is legal, so doing this + // promotion early can cause problems. + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); + SDNode *Load = glueCopyToM0(NewLoad.getNode()); + SelectCode(Load); + N = BitCast.getNode(); + break; + } + + case ISD::STORE: { + // Handle i64 stores here for the same reason mentioned above for loads. 
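+ // That is, a non-truncating store of an i64 value is rewritten as a store
+ // of the value bitcast to v2i32, and the new store (plus the bitcast, when
+ // it is still present) is selected instead of the original node.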
+ StoreSDNode *ST = cast(N); + SDValue Value = ST->getValue(); + if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { + + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), + MVT::v2i32, Value); + SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, + ST->getBasePtr(), ST->getMemOperand()); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); + + if (NewValue.getOpcode() == ISD::BITCAST) { + Select(NewStore.getNode()); + return SelectCode(NewValue.getNode()); + } + + // getNode() may fold the bitcast if its input was another bitcast. If that + // happens we should only select the new store. + N = NewStore.getNode(); + } + + N = glueCopyToM0(N); + break; + } + + case AMDGPUISD::REGISTER_LOAD: { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + break; + SDValue Addr, Offset; + + SDLoc DL(N); + SelectADDRIndirect(N->getOperand(1), Addr, Offset); + const SDValue Ops[] = { + Addr, + Offset, + CurDAG->getTargetConstant(0, DL, MVT::i32), + N->getOperand(0), + }; + return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, + CurDAG->getVTList(MVT::i32, MVT::i64, + MVT::Other), + Ops); + } + case AMDGPUISD::REGISTER_STORE: { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + break; + SDValue Addr, Offset; + SelectADDRIndirect(N->getOperand(2), Addr, Offset); + SDLoc DL(N); + const SDValue Ops[] = { + N->getOperand(1), + Addr, + Offset, + CurDAG->getTargetConstant(0, DL, MVT::i32), + N->getOperand(0), + }; + return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, + CurDAG->getVTList(MVT::Other), + Ops); + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + return getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), + N->getOperand(0), OffsetVal, WidthVal); + + } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } + case ISD::CopyToReg: { + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + Lowering.legalizeTargetIndependentNode(N, *CurDAG); + break; + } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); + case ISD::AND: + case ISD::SRL: + case ISD::SRA: + if (N->getValueType(0) != MVT::i32 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectS_BFE(N); + } + + return SelectCode(N); +} + + +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) + return false; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; +} + +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); +} + +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); +} + +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { + if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getMemoryVT().bitsLT(MVT::i32)) + return true; + + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { + MachineMemOperand *MMO = N->getMemOperand(); + if (checkPrivateAddress(N->getMemOperand())) { + if (MMO) { + const PseudoSourceValue *PSV = MMO->getPseudoValue(); + if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + return true; + } + } + } + return false; +} + +bool 
AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { + if (checkPrivateAddress(N->getMemOperand())) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { + return true; + } + return false; +} + +const char *AMDGPUDAGToDAGISel::getPassName() const { + return "AMDGPU DAG->DAG Pattern Instruction Selection"; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP + +//===----------------------------------------------------------------------===// +// Complex Patterns +//===----------------------------------------------------------------------===// + +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. 
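+ // For example, a read from absolute address 16 selects Base = ZERO and
+ // Offset = 16 rather than materializing the constant pointer in a register.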
+ } else if ((IMMOffset = dyn_cast(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + SDLoc(CurDAG->getEntryNode()), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + SDLoc DL(Addr); + + if ((C = dyn_cast(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } + + return true; +} + +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + SDValue Ops[8]; + + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + +bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + // (add n0, c0) + Base = N0; + Offset = N1; + return true; + } + } + + SDLoc DL(Addr); + + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. + if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { + if (isUInt<16>(CAddr->getZExtValue())) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset = Addr; + return true; + } + } + + // default case + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { + unsigned DWordOffset0 = CAddr->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + assert(4 * DWordOffset0 == CAddr->getZExtValue()); + + if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero + = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); + return true; +} + +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); +} + +void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, + SDValue &VAddr, SDValue 
&SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDLoc DL(Addr); + + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N2; + VAddr = N3; + } else { + + // (add N0, C1) -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = N0; + } + + if (isLegalMUBUFImmOffset(C1)) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return; + } else if (isUInt<32>(C1->getZExtValue())) { + // Illegal offset, store it in soffset. + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), + 0); + return; + } + } + + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) -> addr64 + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N0; + VAddr = N1; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return; + } + + // default case -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + SDValue Ptr, Offen, Idxen, Addr64; + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + ConstantSDNode *C = cast(Addr64); + if (C->getSExtValue()) { + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); + SDValue GLC, TFE; + + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + unsigned ScratchOffsetReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, + ScratchOffsetReg, MVT::i32); + SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); + SDValue ScratchRsrcDword0 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, 
MVT::i32, Sym0), 0); + + SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); + SDValue ScratchRsrcDword1 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); + + const SDValue RsrcOps[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + ScratchRsrcDword0, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + ScratchRsrcDword1, + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + }; + SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, RsrcOps), 0); + Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); + SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + + // (add n0, c1) + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + + if (isLegalMUBUFImmOffset(C1)) { + VAddr = Addr.getOperand(0); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + } + + // (node) + VAddr = Addr; + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &SOffset, SDValue &Offset, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDValue Ptr, VAddr, Offen, Idxen, Addr64; + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + if (!cast(Offen)->getSExtValue() && + !cast(Idxen)->getSExtValue() && + !cast(Addr64)->getSExtValue()) { + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | + APInt::getAllOnesValue(32).getZExtValue(); // Size + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset, + SDValue &GLC) const { + SDValue SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} + +// FIXME: This is incorrect and only enough to be able to compile. +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast(N); + SDLoc DL(N); + + assert(Subtarget->hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. 
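+ // Concretely: a 64-bit to 32-bit cast keeps sub0 of the source register,
+ // a 32-bit to 64-bit cast builds a REG_SEQUENCE with the source in sub0 and
+ // zero in sub1, and a same-size cast becomes a plain bitcast.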
+ + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + + // FIXME: This is probably wrong, we should never be defining + // a register class with both VGPRs and SGPRs + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, + MVT::i32); + + const SDValue Ops[] = { + RC, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(0, DL, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width) { + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + uint32_t PackedVal = Offset | (Width << 16); + SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { + // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) + // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) + // Predicate: 0 < b <= c < 32 + + const SDValue &Shl = N->getOperand(0); + ConstantSDNode *B = dyn_cast(Shl->getOperand(1)); + ConstantSDNode *C = dyn_cast(N->getOperand(1)); + + if (B && C) { + uint32_t BVal = B->getZExtValue(); + uint32_t CVal = C->getZExtValue(); + + if (0 < BVal && BVal <= CVal && CVal < 32) { + bool Signed = N->getOpcode() == ISD::SRA; + unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), + CVal - BVal, 32 - CVal); + } + } + return SelectCode(N); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { + switch (N->getOpcode()) { + case ISD::AND: + if (N->getOperand(0).getOpcode() == ISD::SRL) { + // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" + // Predicate: isMask(mask) + const SDValue &Srl = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast(Srl.getOperand(1)); + ConstantSDNode *Mask = dyn_cast(N->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue(); + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), + ShiftVal, WidthVal); + } + } + } + break; + case ISD::SRL: + if (N->getOperand(0).getOpcode() == ISD::AND) { + // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" + // Predicate: isMask(mask >> b) + const SDValue &And = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast(N->getOperand(1)); + ConstantSDNode *Mask = dyn_cast(And->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), + ShiftVal, WidthVal); + } + } + } else if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + case ISD::SRA: + if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + } + + return SelectCode(N); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + + unsigned Mods = 0; + + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::FABS) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + SDLoc DL(In); + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Omod) const { + // FIXME: Handle Omod + Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const { + Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + return SelectVOP3Mods(In, Src, SrcMods); +} + +void AMDGPUDAGToDAGISel::PostprocessISelDAG() { + const AMDGPUTargetLowering& Lowering = + *static_cast(getTargetLowering()); + bool IsModified = false; + do { + IsModified = false; + // Go over all selected nodes and try to fold them a bit more + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ++I) { + + SDNode *Node = I; + + MachineSDNode *MachineNode = dyn_cast(I); + if (!MachineNode) + continue; + + SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); + if (ResNode != Node) { + ReplaceUses(Node, 
ResNode); + IsModified = true; + } + } + CurDAG->RemoveDeadNodes(); + } while (IsModified); +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp new file mode 100644 index 00000000000..d56838ec201 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -0,0 +1,2866 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600MachineFunctionInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} + + +static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + + return true; +} + +#include "AMDGPUGenCallingConv.inc" + +// Find a larger type to do a load / store of a vector with. +EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. 
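+// For example, a v2i8 value (16-bit store size) is loaded through an i32
+// register, while a v2i64 value (128 bits) uses a v4i32 register.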
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v8f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + + // Custom lowering of vector stores is required for local address space + // stores. + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + + // XXX: This can be change to Custom, once ExpandVectorStores can + // handle 64-bit stores. 
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } + + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. 
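+ // (Roughly, copysign(x, y) == BFI(0x7fffffff, x, y): keep the exponent and
+ // mantissa bits of x and take the sign bit from y.)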
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + + setOperationAction(ISD::SMIN, MVT::i32, Legal); + setOperationAction(ISD::UMIN, MVT::i32, Legal); + setOperationAction(ISD::SMAX, MVT::i32, Legal); + setOperationAction(ISD::UMAX, MVT::i32, Legal); + + if (!Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + + if (!Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + + static const MVT::SimpleValueType VectorIntTypes[] = { + MVT::v2i32, MVT::v4i32 + }; + + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. 
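+ // Expand here generally means the legalizer unrolls the vector operation
+ // into the corresponding scalar i32 operations.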
+ setOperationAction(ISD::ADD, VT, Expand); + setOperationAction(ISD::AND, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::OR, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::SUB, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + static const MVT::SimpleValueType FloatVectorTypes[] = { + MVT::v2f32, MVT::v4f32 + }; + + for (MVT VT : FloatVectorTypes) { + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + 
setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(false); + + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + + // There are no integer divide instructions, and these expand to a pretty + // large sequence of instructions. + setIntDivIsCheap(false); + setPow2SDivIsCheap(false); + setFsqrtIsCheap(true); + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; +} + +//===----------------------------------------------------------------------===// +// Target Information +//===----------------------------------------------------------------------===// + +MVT AMDGPUTargetLowering::getVectorIdxTy() const { + return MVT::i32; +} + +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. +bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, + ISD::LoadExtType, + EVT NewVT) const { + + unsigned NewSize = NewVT.getStoreSizeInBits(); + + // If we are reducing to a 32-bit load, this is always better. + if (NewSize == 32) + return true; + + EVT OldVT = N->getValueType(0); + unsigned OldSize = OldVT.getStoreSizeInBits(); + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. + + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (OldSize < 32); +} + +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, + EVT CastTy) const { + if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) + return true; + + unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + + return ((LScalarSize <= CastScalarSize) || + (CastScalarSize >= 32) || + (LScalarSize < 32)); +} + +// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also +// profitable with the expansion for 64-bit since it's generally good to +// speculate things. +// FIXME: These should really have the size as a parameter. 
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { + return true; +} + +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { + return true; +} + +//===---------------------------------------------------------------------===// +// Target Properties +//===---------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const { + return true; +} + +bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { + // Truncate is just accessing a subregister. + return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { + // Truncate is just accessing a subregister. + return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && + (Dest->getPrimitiveSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { + const DataLayout *DL = getDataLayout(); + unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); + unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); + + return SrcSize == 32 && DestSize == 64; +} + +bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { + // Any register load of a 64-bit value really requires 2 32-bit moves. For all + // practical purposes, the extra mov 0 to load a 64-bit is free. As used, + // this will enable reducing 64-bit operations the 32-bit, which is always + // good. + return Src == MVT::i32 && Dest == MVT::i64; +} + +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + +bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { + // There aren't really 64-bit registers, but pairs of 32-bit ones and only a + // limited number of native 64-bit operations. Shrinking an operation to fit + // in a single 32-bit register should always be helpful. As currently used, + // this is much less general than the name suggests, and is only used in + // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is + // not profitable, and may actually be harmful. 
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl &Ins) const { + + State.AnalyzeFormalArguments(Ins, CC_AMDGPU); +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName(""); + + if (const ExternalSymbolSDNode *G = dyn_cast(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + llvm_unreachable("Custom lowering code for this" + "instruction is not implemented yet!"); + break; + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FREM: return LowerFREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + } + return Op; +} + +void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND_INREG: + // Different parts of legalization seem to interpret which type of + // sign_extend_inreg is the one to check for custom lowering. The extended + // from type is what really matters, but some places check for custom + // lowering of the result type. This results in trying to use + // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do + // nothing here and let the illegal result integer be handled normally. 
+ return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; + + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; + } + default: + return; + } +} + +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. +SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, + const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const { + const DataLayout *TD = getDataLayout(); + SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + + if (const ConstantInt *CI = dyn_cast(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + + if (const ConstantFP *CFP = dyn_cast(Init)) { + EVT VT = EVT::getEVT(CFP->getType()); + PointerType *PtrTy = PointerType::get(CFP->getType(), 0); + return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(CFP->getType())); + } + + if (StructType *ST = dyn_cast(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + + EVT PtrVT = InitPtr.getValueType(); + SmallVector Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); + SmallVector Chains; + for (unsigned i = 0; i < NumElements; ++i) { + SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if 
(isa(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); +} + +static bool hasDefinedInitializer(const GlobalValue *GV) { + const GlobalVariable *GVar = dyn_cast(GV); + if (!GVar || !GVar->hasInitializer()) + return false; + + if (isa(GVar->getInitializer())) + return false; + + return true; +} + +SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, + SDValue Op, + SelectionDAG &DAG) const { + + const DataLayout *TD = getDataLayout(); + GlobalAddressSDNode *G = cast(Op); + const GlobalValue *GV = G->getGlobal(); + + switch (G->getAddressSpace()) { + case AMDGPUAS::LOCAL_ADDRESS: { + // XXX: What does the value of G->getOffset() mean? + assert(G->getOffset() == 0 && + "Do not know what to do with an non-zero offset"); + + // TODO: We could emit code to handle the initialization somewhere. + if (hasDefinedInitializer(GV)) + break; + + unsigned Offset; + if (MFI->LocalMemoryObjects.count(GV) == 0) { + uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); + Offset = MFI->LDSSize; + MFI->LocalMemoryObjects[GV] = Offset; + // XXX: Account for alignment? + MFI->LDSSize += Size; + } else { + Offset = MFI->LocalMemoryObjects[GV]; + } + + return DAG.getConstant(Offset, SDLoc(Op), + getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + Type *EltType = GV->getType()->getElementType(); + unsigned Size = TD->getTypeAllocSize(EltType); + unsigned Alignment = TD->getPrefTypeAlignment(EltType); + + MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + + const GlobalVariable *Var = cast(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. 
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + + const Constant *Init = Var->getInitializer(); + SmallVector WorkList; + + for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), + E = DAG.getEntryNode()->use_end(); I != E; ++I) { + if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) + continue; + WorkList.push_back(*I); + } + SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); + for (SmallVector::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + SmallVector Ops; + Ops.push_back(Chain); + for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { + Ops.push_back((*I)->getOperand(i)); + } + DAG.UpdateNodeOperands(*I, Ops); + } + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + } + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadInit(Fn, + "initializer for address space"); + DAG.getContext()->diagnose(BadInit); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + SmallVector Args; + + for (const SDUse &U : Op->ops()) + DAG.ExtractVectorElements(U.get(), Args); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + + SmallVector Args; + unsigned Start = cast(Op.getOperand(1))->getZExtValue(); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); + + FrameIndexSDNode *FIN = cast(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } else { + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + } + + case Intrinsic::AMDGPU_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfi: + return DAG.getNode(AMDGPUISD::BFI, DL, VT, + Op.getOperand(1), + 
Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfm: + return DAG.getNode(AMDGPUISD::BFM, DL, VT, + Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_brev: + return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. + return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + } +} + +///IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Op.getOperand(1)); + + return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, DL, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + break; + case ISD::SETULE: + case ISD::SETULT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + // Ordered. Assume ordered for undefined. + + // Only do this after legalization to avoid interfering with other combines + // which might occur. + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. 
+ if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETOGE: + case ISD::SETOGT: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETCC_INVALID: + llvm_unreachable("Invalid setcc condcode!"); + } + return SDValue(); +} + +// FIXME: Remove this when combines added to DAGCombiner. +SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + ISD::CondCode CCOpcode = cast(CC)->get(); + switch (CCOpcode) { + case ISD::SETULE: + case ISD::SETULT: { + unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETLE: + case ISD::SETLT: { + unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: { + unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast(Op); + EVT MemVT = Load->getMemoryVT(); + EVT MemEltVT = MemVT.getVectorElementType(); + + EVT LoadVT = Op.getValueType(); + EVT EltVT = LoadVT.getVectorElementType(); + EVT PtrVT = Load->getBasePtr().getValueType(); + + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); + SmallVector Loads; + SmallVector Chains; + + SDLoc SL(Op); + unsigned MemEltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), + DAG.getConstant(i * MemEltSize, SL, PtrVT)); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + SrcValue.getWithOffset(i * MemEltSize), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); + } + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorLoad(Op, DAG); + + LoadSDNode *Load = cast(Op); + SDValue BasePtr = Load->getBasePtr(); + EVT PtrVT = BasePtr.getValueType(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + SDValue LoLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, + SrcValue, + LoMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + SDValue HiLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, + Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1)) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast(Op); + EVT MemVT = Store->getMemoryVT(); + unsigned MemBits = MemVT.getSizeInBits(); + + // Byte stores are really expensive, so if possible, try to pack 32-bit vector + // truncating store into an i32 store. + // XXX: We could also handle optimize other vector bitwidths. + if (!MemVT.isVector() || MemBits > 32) { + return SDValue(); + } + + SDLoc DL(Op); + SDValue Value = Store->getValue(); + EVT VT = Value.getValueType(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); + EVT MemEltVT = MemVT.getVectorElementType(); + unsigned MemEltBits = MemEltVT.getSizeInBits(); + unsigned MemNumElements = MemVT.getVectorNumElements(); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); + + SDValue PackedValue; + for (unsigned i = 0; i < MemNumElements; ++i) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, + DAG.getConstant(i, DL, MVT::i32)); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); + + if (i == 0) { + PackedValue = Elt; + } else { + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); + } + } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + Store->isVolatile(), Store->isNonTemporal(), + Store->getAlignment()); +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast(Op); + EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); + EVT EltVT 
= Store->getValue().getValueType().getVectorElementType(); + EVT PtrVT = Store->getBasePtr().getValueType(); + unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); + SDLoc SL(Op); + + SmallVector<SDValue, 8> Chains; + + unsigned EltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Store->getValue(), + DAG.getConstant(i, SL, MVT::i32)); + + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); + SDValue NewStore = + DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, + SrcValue.getWithOffset(i * EltSize), + MemEltVT, Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + Chains.push_back(NewStore); + } + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); +} + +SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Val = Store->getValue(); + EVT VT = Val.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. + if (VT.getVectorNumElements() == 2) + return ScalarizeVectorStore(Op, DAG); + + EVT MemVT = Store->getMemoryVT(); + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + SDLoc SL(Op); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + + EVT PtrVT = BasePtr.getValueType(); + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + SDValue LoStore + = DAG.getTruncStore(Chain, SL, Lo, + BasePtr, + SrcValue, + LoMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + SDValue HiStore + = DAG.getTruncStore(Chain, SL, Hi, + HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); +} + + +SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT VT = Op.getValueType(); + EVT MemVT = Load->getMemoryVT(); + + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. + + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || + Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || + ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) + return SDValue(); + + // Pre-SI private-address extloads narrower than 32 bits are emulated: load the + // containing dword, then shift and mask out the requested value. + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register.
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. + Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); + if (Result.getNode()) { + return Result; + } + + StoreSDNode *Store = cast(Op); + SDValue Chain = Store->getChain(); + if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + Store->getValue().getValueType().isVector()) { + return ScalarizeVectorStore(Op, DAG); + } + + EVT MemVT = Store->getMemoryVT(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + MemVT.bitsLT(MVT::i32)) { + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + SDValue BasePtr = Store->getBasePtr(); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return SDValue(); +} + +// This is a shortcut for integer division because we have fast i32<->f32 +// conversions, and fast f32 reciprocal instructions. The fractional part of a +// float is enough to accurately represent up to a 24-bit integer. 
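The idea described in that comment, divide in f32 (which has a 24-bit significand), truncate, and correct the quotient by at most one, can be sketched as ordinary scalar C++ before the DAG code below. The helper name udivrem24 is made up for illustration, and the hardware RCP instruction is approximated by a plain division, so this is only a model of the algorithm, not the exact instruction sequence:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Unsigned 24-bit division via f32, mirroring the unsigned path of
    // LowerDIVREM24 (where the correction term jq is 1).
    static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
      assert(a < (1u << 24) && b != 0 && b < (1u << 24));

      float fa = static_cast<float>(a);
      float fb = static_cast<float>(b);

      float fq = std::trunc(fa * (1.0f / fb)); // fq = trunc(native_divide(fa, fb))
      float fr = std::fabs(-fq * fb + fa);     // fr = |mad(-fq, fb, fa)|

      q = static_cast<uint32_t>(fq);
      if (fr >= fb)                            // quotient estimate was one too small
        q += 1;
      r = a - q * b;                           // recompute the remainder exactly
    }

    int main() {
      uint32_t q, r;
      udivrem24(1000000, 37, q, r);
      std::printf("%u %u\n", q, r);            // 27027 1
      return 0;
    }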
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT IntVT = MVT::i32; + MVT FltVT = MVT::f32; + + ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + + if (VT.isVector()) { + unsigned NElts = VT.getVectorNumElements(); + IntVT = MVT::getVectorVT(MVT::i32, NElts); + FltVT = MVT::getVectorVT(MVT::f32, NElts); + } + + unsigned BitSize = VT.getScalarType().getSizeInBits(); + + SDValue jq = DAG.getConstant(1, DL, IntVT); + + if (sign) { + // char|short jq = ia ^ ib; + jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, VT, jq, + DAG.getConstant(BitSize - 2, DL, VT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, IntVT); + } + + // int ia = (int)LHS; + SDValue ia = sign ? + DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); + + // int ib, (int)RHS; + SDValue ib = sign ? + DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, + DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); + + // int cv = fr >= fb; + SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); + + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); + + // dst = trunc/extend to legal type + iq = sign ? 
DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); + + // dst = iq + jq; + SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); + + // Rem needs compensation, it's easier to recompute it + SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); + Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, + SelectionDAG &DAG, + SmallVectorImpl &Results) const { + assert(Op.getValueType() == MVT::i64); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, DL, HalfVT); + SDValue zero = DAG.getConstant(0, DL, HalfVT); + + //HiLo split + SDValue LHS = Op.getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = Op.getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + const unsigned bitPos = halfBitWidth - i - 1; + SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); + // Get value of high bit + SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); + + // Shift + REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); + // Add LHS high bit + REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); + + SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); + } + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); +} + +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::i64) { + SmallVector Results; + LowerUDIVREM64(Op, DAG, Results); + return DAG.getMergeValues(Results, DL); + } + + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + + if (VT == MVT::i32) { + if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && + DAG.MaskedValueIsZero(Den, 
APInt::getHighBitsSet(32, 8))) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, false); + } + } + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); + + // RCP_LO = mul(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); + + // RCP_HI = mulhu (RCP, Den) */ + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), + NEG_RCP_LO, RCP_LO, + ISD::SETEQ); + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); + + // RCP_S_E = RCP - E + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), + RCP_A_E, RCP_S_E, + ISD::SETEQ); + // Quotient = mulhu(Tmp0, Num) + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); + + // Remainder = Num - Num_S_Remainder + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, + Num_S_Remainder, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue NegOne = DAG.getConstant(-1, DL, VT); + + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); + } + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) +SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); + SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); + + return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); +} + +SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const 
unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, SL, MVT::i32), + DAG.getConstant(ExpBits, SL, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, SL, MVT::i32)); + + return Exp; +} + +SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + // Extract the upper half, since this is where we will find the sign and + // exponent. + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const unsigned FractBits = 52; + + // Extract the sign bit. + const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); + SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); + + // Extend back to to 64-bits. + SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + Zero, SignBit); + SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); + + SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); + const SDValue FractMask + = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); + + SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); + SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + + SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); + + return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); + SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); + SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); + + SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); + + APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); + + return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { + // FNEARBYINT and FRINT are the same, except in their handling of FP + // exceptions. Those aren't really meaningful for us, and OpenCL only has + // rint, so just treat them as equivalent. + return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); +} + +// XXX - May require not supporting f32 denormals? 
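The LowerFRINT code a few lines above uses the classic 2^52 trick: adding and then subtracting copysign(0x1.0p+52, x) lets the FP adder's round-to-nearest-even discard the fractional bits, and any input whose magnitude already exceeds 0x1.fffffffffffffp+51 is an integer and is returned unchanged. A scalar model of the same idea, assuming the default rounding mode and strict IEEE evaluation (no fast-math), with a made-up helper name:

    #include <cmath>
    #include <cstdio>

    // Round to nearest integer (ties to even) without calling rint directly.
    static double rintVia2p52(double x) {
      const double C1 = 0x1.0p+52;             // 2^52
      const double C2 = 0x1.fffffffffffffp+51; // largest f64 that can still have a fraction
      double Magic = std::copysign(C1, x);
      double Rounded = (x + Magic) - Magic;    // fraction is rounded away by the FP add
      return std::fabs(x) > C2 ? x : Rounded;
    }

    int main() {
      std::printf("%.1f %.1f %.1f\n",
                  rintVia2p52(2.5),                  // 2.0 (ties to even)
                  rintVia2p52(-3.7),                 // -4.0
                  rintVia2p52(4503599627370496.0));  // 2^52, returned unchanged
      return 0;
    }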
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, + MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), SL, + MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, SL, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, SL, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, SL, MVT::f64), + DAG.getConstantFP(0.0, SL, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + +SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src); + // if (src < 0.0 && src != result) + // result += -1.0. 
+ + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(0, SL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(1, SL, MVT::i32)); + + SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, + SL, MVT::f64, Hi); + + SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); + + SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + DAG.getConstant(32, SL, MVT::i32)); + + return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); +} + +SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue S0 = Op.getOperand(0); + if (S0.getValueType() != MVT::i64) + return SDValue(); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, false); + + assert(DestVT == MVT::f32); + + SDLoc DL(Op); + + // f32 uint_to_fp i64 + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(0, DL, MVT::i32)); + SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(1, DL, MVT::i32)); + SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, + DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 + return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); +} + +SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + + SDValue Src = Op.getOperand(0); + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, + MVT::f64); + SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, + MVT::f64); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + + + SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + + SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, + MVT::i32, FloorMul); + SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); + + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, false); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { + EVT ExtraVT = cast(Op.getOperand(1))->getVT(); + MVT VT = Op.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); + + if (!VT.isVector()) + return SDValue(); + + SDValue Src = Op.getOperand(0); + SDLoc DL(Op); + + // TODO: Don't scalarize on Evergreen? + unsigned NElts = VT.getVectorNumElements(); + SmallVector Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); + + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); + + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} + +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} + +template +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width, SDLoc DL) { + if (Width + Offset < 32) { + uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); + IntTy Result = static_cast(Shl) >> (32 - Width); + return DAG.getConstant(Result, DL, MVT::i32); + } + + return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); +} + +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. 
Ideally we could +// recognize the pack / unpack pattern to eliminate it. +SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || + !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) + return SDValue(); + + LoadSDNode *LoadVal = cast(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + +SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + return SDValue(); + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + return DAG.getSExtOrTrunc(Mul, DL, VT); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: + return performMulCombine(N, DCI); + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT: { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32) + return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + + // TODO: Implement min / max Evergreen instructions. 
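+ // The integer min / max combine below only runs for SI and later generations.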
+ if (VT == MVT::i32 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } + } + + break; + } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, DL, MVT::i32); + + ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *CVal = dyn_cast(BitsFrom)) { + if (Signed) { + return constantFoldBFE(DAG, + CVal->getSExtValue(), + OffsetVal, + WidthVal, + DL); + } + + return constantFoldBFE(DAG, + CVal->getZExtValue(), + OffsetVal, + WidthVal, + DL); + } + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); + return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + if (BitsFrom.hasOneUse()) { + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, + KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + } + + break; + } + + case ISD::STORE: + return performStoreCombine(N, DCI); + } + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +void AMDGPUTargetLowering::getOriginalFunctionArgs( + SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl &Ins, + SmallVectorImpl &OrigIns) const { + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + if (Ins[i].ArgVT == Ins[i].VT) { + OrigIns.push_back(Ins[i]); + continue; + } + + EVT VT; + if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { + // Vector has been split into scalars. + VT = Ins[i].ArgVT.getVectorElementType(); + } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && + Ins[i].ArgVT.getVectorElementType() != + Ins[i].VT.getVectorElementType()) { + // Vector elements have been promoted + VT = Ins[i].ArgVT; + } else { + // Vector has been spilt into smaller vectors. 
+ VT = Ins[i].VT; + } + + ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, + Ins[i].OrigArgIndex, Ins[i].PartOffset); + OrigIns.push_back(Arg); + } +} + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->isExactlyValue(1.0); + } + if (ConstantSDNode *C = dyn_cast(Op)) { + return C->isAllOnesValue(); + } + return false; +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->getValueAPF().isZero(); + } + if (ConstantSDNode *C = dyn_cast(Op)) { + return C->isNullValue(); + } + return false; +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((AMDGPUISD::NodeType)Opcode) { + case AMDGPUISD::FIRST_NUMBER: break; + // AMDIL DAG nodes + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) + NODE_NAME_CASE(COS_HW) + NODE_NAME_CASE(SIN_HW) + NODE_NAME_CASE(FMAX_LEGACY) + NODE_NAME_CASE(FMIN_LEGACY) + NODE_NAME_CASE(FMAX3) + NODE_NAME_CASE(SMAX3) + NODE_NAME_CASE(UMAX3) + NODE_NAME_CASE(FMIN3) + NODE_NAME_CASE(SMIN3) + NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(LDEXP) + NODE_NAME_CASE(FP_CLASS) + NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(CARRY) + NODE_NAME_CASE(BORROW) + NODE_NAME_CASE(BFE_U32) + NODE_NAME_CASE(BFE_I32) + NODE_NAME_CASE(BFI) + NODE_NAME_CASE(BFM) + NODE_NAME_CASE(BREV) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) + NODE_NAME_CASE(LOAD_CONSTANT) + NODE_NAME_CASE(LOAD_INPUT) + NODE_NAME_CASE(SAMPLE) + NODE_NAME_CASE(SAMPLEB) + NODE_NAME_CASE(SAMPLED) + NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(CVT_F32_UBYTE0) + NODE_NAME_CASE(CVT_F32_UBYTE1) + NODE_NAME_CASE(CVT_F32_UBYTE2) + NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + NODE_NAME_CASE(CONST_DATA_PTR) + case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(SENDMSG) + NODE_NAME_CASE(INTERP_MOV) + NODE_NAME_CASE(INTERP_P1) + NODE_NAME_CASE(INTERP_P2) + NODE_NAME_CASE(STORE_MSKOR) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; + } + return nullptr; +} + +SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + RefinementSteps = 0; + return 
DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also an f64 rsq instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + // Reciprocal, < 1 ulp error. + // + // This reciprocal approximation converges to < 0.5 ulp error with one + // Newton-Raphson iteration performed with two fused multiply-adds (FMAs). + + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also an f64 rcp instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +static void computeKnownBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { + APInt Op0Zero, Op0One; + APInt Op1Zero, Op1One; + DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); + DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); + + KnownZero = Op0Zero & Op1Zero; + KnownOne = Op0One & Op1One; +} + +void AMDGPUTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; + unsigned Opc = Op.getOpcode(); + + switch (Opc) { + default: + break; + case ISD::INTRINSIC_WO_CHAIN: { + // FIXME: The intrinsic should just use the node. + switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + case AMDGPUIntrinsic::AMDGPU_imax: + case AMDGPUIntrinsic::AMDGPU_umax: + case AMDGPUIntrinsic::AMDGPU_imin: + case AMDGPUIntrinsic::AMDGPU_umin: + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); + break; + default: + break; + } + + break; + } + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: { + KnownZero = APInt::getHighBitsSet(32, 31); + break; + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + + if (Opc == AMDGPUISD::BFE_U32) + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + + break; + } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ?
32 - (Width->getZExtValue() & 0x1f) : 1; + } + + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: + return 31; + + default: + return 1; + } +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h new file mode 100644 index 00000000000..fbb7d3c8843 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -0,0 +1,307 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUMachineFunction; +class AMDGPUSubtarget; +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +protected: + const AMDGPUSubtarget *Subtarget; + +private: + SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + /// \brief Lower vector stores by merging the vector elements into an integer + /// of the same bitwidth. + SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; + /// \brief Split a vector store into multiple scalar stores. + /// \returns The resulting chain. + + SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); + + virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; + + /// \brief Split a vector load into a scalar load of each component. 
+ SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into 2 loads of half the vector. + SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into a scalar store of each component. + SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into 2 stores of half the vector. + SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const; + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + /// The SelectionDAGBuilder will automatically promote function arguments + /// with illegal types. However, this does not work for the AMDGPU targets + /// since the function arguments are stored in memory as these illegal types. + /// In order to handle this properly we need to get the original type sizes + /// from the LLVM IR Function and fix up the ISD::InputArg values before + /// passing them to AnalyzeFormalArguments() + void getOriginalFunctionArgs(SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<ISD::InputArg> &OrigIns) const; + void AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + + bool isFAbsFree(EVT VT) const override; + bool isFNegFree(EVT VT) const override; + bool isTruncateFree(EVT Src, EVT Dest) const override; + bool isTruncateFree(Type *Src, Type *Dest) const override; + + bool isZExtFree(Type *Src, Type *Dest) const override; + bool isZExtFree(EVT Src, EVT Dest) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + + MVT getVectorIdxTy() const override; + bool isSelectSupported(SelectSupportKind) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool ShouldShrinkFPConstant(EVT VT) const override; + bool shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtType, + EVT ExtVT) const override; + + bool isLoadBitCastBeneficial(EVT, EVT) const override; + + bool storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; + SDValue CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const; + SDValue
CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; + + const char* getTargetNodeName(unsigned Opcode) const override; + + SDValue getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + + virtual SDNode *PostISelFolding(MachineSDNode *N, + SelectionDAG &DAG) const { + return N; + } + + /// \brief Determine which of the bits specified in \p Mask are known to be + /// either zero or one and return them in the \p KnownZero and \p KnownOne + /// bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. + virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; +}; + +namespace AMDGPUISD { + +enum NodeType : unsigned { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + DWORDADDR, + FRACT, + CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. + COS_HW, + SIN_HW, + FMAX_LEGACY, + FMIN_LEGACY, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, + URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, + LDEXP, + FP_CLASS, + DOT4, + CARRY, + BORROW, + BFE_U32, // Extract range of bits with zero extension to 32-bits. + BFE_I32, // Extract range of bits with sign extension to 32-bits. + BFI, // (src0 & src1) | (~src0 & src2) + BFM, // Insert a range of bits into a 32-bit word. + BREV, // Reverse bits. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, + TEXTURE_FETCH, + EXPORT, + CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, + LOAD_INPUT, + SAMPLE, + SAMPLEB, + SAMPLED, + SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + /// Pointer to the start of the shader's constant data. 
+ CONST_DATA_PTR, + SENDMSG, + INTERP_MOV, + INTERP_P1, + INTERP_P2, + FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + STORE_MSKOR, + LOAD_CONSTANT, + TBUFFER_STORE_FORMAT, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp new file mode 100644 index 00000000000..15a3d543a68 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -0,0 +1,369 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRINFO_NAMED_OPS +#define GET_INSTRMAP_INFO +#include "AMDGPUGenInstrInfo.inc" + +// Pin the vtable to this file. +void AMDGPUInstrInfo::anchor() {} + +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1, -1), ST(st) {} + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return nullptr; +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + llvm_unreachable("Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + 
llvm_unreachable("Not Implemented"); +} + +bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { + MachineBasicBlock *MBB = MI->getParent(); + int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::addr); + // addr is a custom operand with multiple MI operands, and only the + // first MI operand is given a name. + int RegOpIdx = OffsetOpIdx + 1; + int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::chan); + if (isRegisterLoad(*MI)) { + int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::dst); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + getIndirectAddrRegClass()->getRegister(Address)); + } else { + buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + Address, OffsetReg); + } + } else if (isRegisterStore(*MI)) { + int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::val); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), + MI->getOperand(ValOpIdx).getReg()); + } else { + buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), + calculateIndirectAddress(RegIndex, Channel), + OffsetReg); + } + } else { + return false; + } + + MBB->erase(MI); + return true; +} + +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { +// TODO: Implement this function + return nullptr; +} +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + // TODO: Implement this function + return nullptr; +} +bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef Ops) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::enableClusterLoads() const { + return true; +} + +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. 
This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} + +int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = -1; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + unsigned Reg = LI->first; + if (TargetRegisterInfo::isVirtualRegister(Reg) || + !IndirectRC->contains(Reg)) + continue; + + unsigned RegIndex; + unsigned RegEnd; + for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; + ++RegIndex) { + if (IndirectRC->getRegister(RegIndex) == Reg) + break; + } + Offset = std::max(Offset, (int)RegIndex); + } + + return Offset + 1; +} + +int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + if (MFI->getNumObjects() == 0) { + return -1; + } + + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + + return getIndirectIndexBegin(MF) + Offset; +} + +int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { + switch (Channels) { + default: return Opcode; + case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); + case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); + case 3: return AMDGPU::getMaskedMIMGOp(Opcode, 
AMDGPU::Channels_3); + } +} + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. +namespace llvm { +namespace AMDGPU { +static int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); +} +} +} + +// This must be kept in sync with the SISubtarget class in SIInstrInfo.td +enum SISubtarget { + SI = 0, + VI = 1 +}; + +static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { + switch (Gen) { + default: + return SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + return VI; + } +} + +int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { + int MCOp = AMDGPU::getMCOpcode( + Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h new file mode 100644 index 00000000000..86d3962b385 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -0,0 +1,206 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H + +#include "AMDGPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUSubtarget; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + virtual void anchor(); +protected: + const AMDGPUSubtarget &ST; +public: + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const override; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + 
MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const override; + + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, + MachineInstr *LoadMI) const override; + +public: + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexBegin(const MachineFunction &MF) const; + + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexEnd(const MachineFunction &MF) const; + + bool canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef Ops) const override; + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl &NewMIs) const override; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl &NewNodes) const override; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; + + bool enableClusterLoads() const override; + + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; + bool SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const override; + bool DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + // Helper functions that check the opcode for status information + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; + + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + + /// \brief Return the descriptor of the target-specific machine instruction + /// that corresponds to the specified pseudo or native opcode. + const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { + return get(pseudoToMCOpcode(Opcode)); + } + +//===---------------------------------------------------------------------===// +// Pure virtual funtions to be implemented by sub-classes. 
+//===---------------------------------------------------------------------===// + + virtual bool isMov(unsigned opcode) const = 0; + + /// \brief Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accesed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const = 0; + + /// \returns The register class to be used for loading and storing values + /// from an "Indirect Address" . + virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; + + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build a MOV instruction. + virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const = 0; + + /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the + /// equivalent opcode that writes \p Channels Channels. + int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; + +}; + +namespace AMDGPU { + int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); +} // End namespace AMDGPU + +} // End llvm namespace + +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td new file mode 100644 index 00000000000..b413897d9d2 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -0,0 +1,245 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node defintions for the AMDGPU target. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPULdExpOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUFPClassOp : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + +// float, float, float, vcc +def AMDGPUFmasOp : SDTypeProfile<1, 4, + [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] +>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// This argument to this node is a dword address. +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) result clamped to +/- max_float. +def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; + +def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; + +def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; + +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [] +>; + +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats, where a nan comparison fails. +def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [] +>; + +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. 
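+// That is why the SDNPCommutative / SDNPAssociative flags on the three-operand min / max nodes below are left commented out.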
+// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 +def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; + +// out = (src1 > src0) ? 1 : 0 +def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; + + +def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", + SDTIntToFPOp, []>; + + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. +// out = (2^32 / a) + e +// e is rounding error +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; + +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; + +// Single or double precision division fixup. +// Special case divide fixup and flags(src0 = Quotient, src1 = +// Denominator, src2 = Numerator). +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look Up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + +def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + +// MSKOR instructions are atomic memory instructions used mainly for storing +// 8-bit and 16-bit values. 
The definition is: +// +// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) +// +// src0: vec4(src, 0, 0, mask) +// src1: dst - rat offset (aka pointer) in dwords +def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", + SDTypeProfile<0, 2, []>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def AMDGPUround : SDNode<"ISD::FROUND", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; + +def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; +def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; + +def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; + +// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when +// performing the mulitply. The result is a 32-bit value. +def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", + SDTypeProfile<1, 4, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td new file mode 100644 index 00000000000..72cab39277c --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -0,0 +1,682 @@ +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction defs that are common to all hw codegen +// targets. 
+// +//===----------------------------------------------------------------------===// + +class AMDGPUInst pattern> : Instruction { + field bit isRegisterLoad = 0; + field bit isRegisterStore = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = NullALU; + + let TSFlags{63} = isRegisterLoad; + let TSFlags{62} = isRegisterStore; +} + +class AMDGPUShaderInst pattern> + : AMDGPUInst { + + field bits<32> Inst = 0xffffffff; + +} + +def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; + +def InstFlag : OperandWithDefaultOps ; +def ADDRIndirect : ComplexPattern; + +let OperandType = "OPERAND_IMMEDIATE" in { + +def u32imm : Operand { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand { + let PrintMethod = "printU8ImmOperand"; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand; + +//===----------------------------------------------------------------------===// +// PatLeafs for floating-point comparisons +//===----------------------------------------------------------------------===// + +def COND_OEQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] +>; + +def COND_ONE : PatLeaf < + (cond), + [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] +>; + +def COND_OGT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] +>; + +def COND_OGE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] +>; + +def COND_OLT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] +>; + +def COND_OLE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] +>; + + +def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; +def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for unsigned / unordered comparisons +//===----------------------------------------------------------------------===// + +def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; +def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; +def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; +def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; +def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; +def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; + +// XXX - For some reason R600 version is preferring to use unordered +// for setne? 
+def COND_UNE_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] +>; + +//===----------------------------------------------------------------------===// +// PatLeafs for signed comparisons +//===----------------------------------------------------------------------===// + +def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; +def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; +def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; +def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for integer equality +//===----------------------------------------------------------------------===// + +def COND_EQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] +>; + +def COND_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] +>; + +def COND_NULL : PatLeaf < + (cond), + [{(void)N; return false;}] +>; + +//===----------------------------------------------------------------------===// +// Load/Store Pattern Fragments +//===----------------------------------------------------------------------===// + +class PrivateMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; +}]>; + +class PrivateLoad : PrivateMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class PrivateStore : PrivateMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def load_private : PrivateLoad ; + +def truncstorei8_private : PrivateStore ; +def truncstorei16_private : PrivateStore ; +def store_private : PrivateStore ; + +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +class AZExtLoadBase : PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ + LoadSDNode *L = cast(N); + return L->getExtensionType() == ISD::ZEXTLOAD || + L->getExtensionType() == ISD::EXTLOAD; +}]>; + +def az_extload : AZExtLoadBase ; + +def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def extloadi8_private : PrivateLoad ; +def sextloadi8_private : 
PrivateLoad ; + +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def extloadi16_private : PrivateLoad ; +def sextloadi16_private : PrivateLoad ; + +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def az_extloadi32_global : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def az_extloadi32_constant : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def local_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + +def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + +def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + +def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; + +class local_binary_atomic_op : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + +def atomic_swap_local : local_binary_atomic_op; +def atomic_load_add_local : local_binary_atomic_op; +def atomic_load_sub_local : local_binary_atomic_op; +def atomic_load_and_local : 
local_binary_atomic_op; +def atomic_load_or_local : local_binary_atomic_op; +def atomic_load_xor_local : local_binary_atomic_op; +def atomic_load_nand_local : local_binary_atomic_op; +def atomic_load_min_local : local_binary_atomic_op; +def atomic_load_max_local : local_binary_atomic_op; +def atomic_load_umin_local : local_binary_atomic_op; +def atomic_load_umax_local : local_binary_atomic_op; + +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + +multiclass AtomicCmpSwapLocal { + + def _32_local : PatFrag < + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; + + def _64_local : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; +} + +defm atomic_cmp_swap : AtomicCmpSwapLocal ; + +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + +class global_binary_atomic_op : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] +>; + +def atomic_swap_global : global_binary_atomic_op; +def atomic_add_global : global_binary_atomic_op; +def atomic_and_global : global_binary_atomic_op; +def atomic_max_global : global_binary_atomic_op; +def atomic_min_global : global_binary_atomic_op; +def atomic_or_global : global_binary_atomic_op; +def atomic_sub_global : global_binary_atomic_op; +def atomic_umax_global : global_binary_atomic_op; +def atomic_umin_global : global_binary_atomic_op; +def atomic_xor_global : global_binary_atomic_op; + +//===----------------------------------------------------------------------===// +// Misc Pattern Fragments +//===----------------------------------------------------------------------===// + +class Constants { +int TWO_PI = 0x40c90fdb; +int PI = 0x40490fdb; +int TWO_PI_INV = 0x3e22f983; +int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; +} +def CONST : Constants; + +def FP_ZERO : PatLeaf < + (fpimm), + [{return N->getValueAPF().isZero();}] +>; + +def FP_ONE : PatLeaf < + (fpimm), + [{return N->isExactlyValue(1.0);}] +>; + +def FP_HALF : PatLeaf < + (fpimm), + [{return N->isExactlyValue(0.5);}] +>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { + +class CLAMP : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] +>; + +class FABS : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set f32:$dst, (fabs f32:$src0))] +>; + +class FNEG : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set f32:$dst, (fneg f32:$src0))] +>; + +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore { 
+let UseNamedOperandTable = 1 in { + + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} +} + +} // End isCodeGenOnly = 1, isPseudo = 1 + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common + : Pat < + (fpow f32:$src0, f32:$src1), + (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) +>; + +/* Other helper patterns */ +/* --------------------- */ + +/* Extract element pattern */ +class Extract_Element + : Pat< + (sub_type (vector_extract vec_type:$src, sub_idx)), + (EXTRACT_SUBREG $src, sub_reg) +>; + +/* Insert element pattern */ +class Insert_Element + : Pat < + (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (INSERT_SUBREG $vec, $elem, sub_reg) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. +// bitconvert pattern +class BitConvert : Pat < + (dt (bitconvert (st rc:$src0))), + (dt rc:$src0) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. +class DwordAddrPat : Pat < + (vt (AMDGPUdwordaddr (vt rc:$addr))), + (vt rc:$addr) +>; + +// BFI_INT patterns + +multiclass BFIPatterns { + // Definition from ISA doc: + // (y & x) | (z & ~x) + def : Pat < + (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), + (BFI_INT $x, $y, $z) + >; + + // SHA-256 Ch function + // z ^ (x & (y ^ z)) + def : Pat < + (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), + (BFI_INT $x, $y, $z) + >; + + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; +} + +// SHA-256 Ma patterns + +// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y +class SHA256MaPattern : Pat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) +>; + +// Bitfield extract patterns + +def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ + return isMask_32(N->getZExtValue()); +}]>; + +def IMMPopCount : SDNodeXFormgetTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +class BFEPattern : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) +>; + +// rotr pattern +class ROTRPattern : Pat < + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +>; + +// 24-bit arithmetic patterns +def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; + +// Special conversion patterns + +def cvt_rpi_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor (fadd $src, FP_HALF))), + [{ (void) N; return TM.Options.NoNaNsFPMath; }] +>; + +def cvt_flr_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor $src)), + [{ (void)N; return TM.Options.NoNaNsFPMath; }] +>; + +/* +class UMUL24Pattern : Pat < + (mul U24:$x, U24:$y), + (UMUL24 $x, $y) 
+>; +*/ + +class IMad24Pat : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +class RcpPat : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + +include "SIInstrInfo.td" + diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp new file mode 100644 index 00000000000..e94bb6013d8 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -0,0 +1,77 @@ +//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDGPUGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() + : TargetIntrinsicInfo() {} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + if (IntrID < Intrinsic::num_intrinsics) { + return nullptr; + } + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { + if (!StringRef(Name, Len).startswith("llvm.")) + return 0; // All intrinsics start with 'llvm.' 
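
// A bit-level reference for the BFIPatterns and BFEPattern classes defined in
// AMDGPUInstructions.td above (illustrative helpers only, not target code).
#include <cstdint>

// BFI_INT(x, y, z): bits of y where x is 1, bits of z where x is 0.
// The SHA-256 "Ch" form z ^ (x & (y ^ z)) computes the same function, which
// is why both source patterns map to the one instruction.
static uint32_t bfi(uint32_t X, uint32_t Y, uint32_t Z) {
  return (Y & X) | (Z & ~X);
}

// fcopysign for f32: magnitude bits from Src0, sign bit from Src1.
static uint32_t copySignBits(uint32_t Src0, uint32_t Src1) {
  return bfi(0x7fffffffu, Src0, Src1);
}

// BFEPattern: (X >> Shift) & Mask, with Mask a zero-based bitmask, is
// selected as BFE(X, Shift, popcount(Mask)); the width operand is the mask's
// popcount.
static uint32_t bfeU32(uint32_t X, uint32_t Shift, uint32_t Width) {
  return Width >= 32 ? X >> Shift : (X >> Shift) & ((1u << Width) - 1);
}
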
+ +#define GET_FUNCTION_RECOGNIZER +#include "AMDGPUGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); + + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { + llvm_unreachable("Not implemented"); +} diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h new file mode 100644 index 00000000000..4c95b5ec097 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -0,0 +1,48 @@ +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. +// +//===-----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace AMDGPUIntrinsic { +enum ID { + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDGPU_intrinsics +}; + +} // end namespace AMDGPUIntrinsic + +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +public: + AMDGPUIntrinsicInfo(); + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned numTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, + Type **Tys = nullptr, + unsigned numTys = 0) const override; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td new file mode 100644 index 00000000000..ab489cd2a4a --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -0,0 +1,90 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. 
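
// Sketch of the intrinsic-ID layout that AMDGPUIntrinsicInfo::getName above
// relies on: target intrinsic IDs start right after the generic LLVM ones, so
// the generated name table is indexed with IntrID - Intrinsic::num_intrinsics.
// All names and counts below are stand-ins, not the generated values.
#include <cassert>
#include <string>

namespace example {
constexpr unsigned NumGenericIntrinsics = 1000; // stands in for Intrinsic::num_intrinsics
enum ID : unsigned {
  first_amdgpu = NumGenericIntrinsics,          // last_non_AMDGPU_intrinsic + 1
  second_amdgpu,
  num_amdgpu_intrinsics
};
const char *const Names[] = {"llvm.AMDGPU.first", "llvm.AMDGPU.second"};

std::string getName(unsigned IntrID) {
  assert(IntrID >= NumGenericIntrinsics && IntrID < num_amdgpu_intrinsics);
  return Names[IntrID - NumGenericIntrinsics];  // same arithmetic as above
}
} // namespace example
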
+// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. + def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax 
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; +} + +// Legacy names for compatibility. +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; +} + +include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp new file mode 100644 index 00000000000..20831460b93 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -0,0 +1,154 @@ +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + + if (MCOpcode == -1) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " + "a target-specific version: " + Twine(MI->getOpcode())); + } + + OutMI.setOpcode(MCOpcode); + + for (const MachineOperand &MO : MI->explicit_operands()) { + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( + MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); + break; + } + case MachineOperand::MO_TargetIndex: { + assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + case MachineOperand::MO_ExternalSymbol: { + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + const AMDGPUSubtarget &STI = MF->getSubtarget(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI); + +#ifdef _DEBUG + StringRef Err; + if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { + errs() << "Warning: Illegal instruction detected: " << Err << "\n"; + MI->dump(); + } +#endif + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = MI; + ++I; + while (I != MBB->end() && I->isInsideBundle()) { + EmitInstruction(I); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + + if (STI.dumpCode()) { + // Disassemble instruction/operands to text. 
+ DisasmLines.resize(DisasmLines.size() + 1); + std::string &DisasmLine = DisasmLines.back(); + raw_string_ostream DisasmStream(DisasmLine); + + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), + MF->getSubtarget()); + + // Disassemble instruction/operands to hex representation. + SmallVector Fixups; + SmallVector CodeBytes; + raw_svector_ostream CodeStream(CodeBytes); + + auto &ObjStreamer = static_cast(*OutStreamer); + MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); + InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, + MF->getSubtarget()); + CodeStream.flush(); + + HexLines.resize(HexLines.size() + 1); + std::string &HexLine = HexLines.back(); + raw_string_ostream HexStream(HexLine); + + for (size_t i = 0; i < CodeBytes.size(); i += 4) { + unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i]; + HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord); + } + + DisasmStream.flush(); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size()); + } + } +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h new file mode 100644 index 00000000000..d322fe072b2 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -0,0 +1,35 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H + +namespace llvm { + +class AMDGPUSubtarget; +class MachineInstr; +class MCContext; +class MCInst; + +class AMDGPUMCInstLower { + MCContext &Ctx; + const AMDGPUSubtarget &ST; + +public: + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp new file mode 100644 index 00000000000..21c7da66323 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -0,0 +1,25 @@ +#include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +using namespace llvm; + +static const char *const ShaderTypeAttribute = "ShaderType"; + +// Pin the vtable to this file. 
+void AMDGPUMachineFunction::anchor() {} + +AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ScratchSize(0), + IsKernel(true) { + Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, ShaderType)) + llvm_unreachable("Can't parse shader type!"); + } +} diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h new file mode 100644 index 00000000000..f5e4694e76f --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -0,0 +1,45 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H + +#include "llvm/CodeGen/MachineFunction.h" +#include + +namespace llvm { + +class AMDGPUMachineFunction : public MachineFunctionInfo { + virtual void anchor(); + unsigned ShaderType; + +public: + AMDGPUMachineFunction(const MachineFunction &MF); + /// A map to keep track of local memory objects and their offsets within + /// the local memory space. + std::map LocalMemoryObjects; + /// Number of bytes in the LDS that are being used. + unsigned LDSSize; + + /// Start of implicit kernel args + unsigned ABIArgOffset; + + unsigned getShaderType() const { + return ShaderType; + } + + unsigned ScratchSize; + bool IsKernel; +}; + +} +#endif diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp new file mode 100644 index 00000000000..4a65bfc57f1 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,407 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. 
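
// How a front end might tag a function so that the AMDGPUMachineFunction
// constructor above picks up a non-default shader type. The attribute name
// "ShaderType" matches the code above; the use of addFnAttr/utostr here is an
// illustrative sketch, not something this patch adds.
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Function.h"

void setShaderType(llvm::Function &F, unsigned Type) {
  // Stored as a string attribute; parsed back with StringRef::getAsInteger.
  F.addFnAttr("ShaderType", llvm::utostr(Type));
}
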
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Promote Alloca"; } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + const Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(const Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value * +calculateVectorIndex(Value *Ptr, + const std::map &GEPIdx) { + if (isa(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast(Ptr); + + auto I = GEPIdx.find(GEP); + return I == GEPIdx.end() ? nullptr : I->second; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME we only support simple cases + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +// Not an instruction handled below to turn into a vector. +// +// TODO: Check isTriviallyVectorizable for calls and handle other +// instructions. 
+static bool canVectorizeInst(Instruction *Inst) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + return true; + default: + return false; + } +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now. + if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map GEPVectorIdx; + std::vector WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast(AllocaUser); + if (!GEP) { + if (!canVectorizeInst(cast(AllocaUser))) + return false; + + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. + if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + if (!canVectorizeInst(cast(GEPUser))) + return false; + + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector " + << *AllocaTy << " -> " << *VectorTy << '\n'); + + for (std::vector::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + } + return true; +} + +static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { + bool Success = true; + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (isa(User)) { + WorkList.push_back(User); + continue; + } + + // FIXME: Correctly handle ptrtoint instructions. 
+ Instruction *UseInst = dyn_cast(User); + if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + return false; + + if (!User->getType()->isPointerTy()) + continue; + + WorkList.push_back(User); + + Success &= collectUsesWithPtrTypes(User, WorkList); + } + return Success; +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. + unsigned WorkGroupSize = 256; + int AllocaSize = + WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + std::vector WorkList; + + if (!collectUsesWithPtrTypes(&I, WorkList)) { + DEBUG(dbgs() << " Do not know how to convert all uses\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); + GlobalVariable *GV = new GlobalVariable( + *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); + Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); + Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + for (std::vector::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + // The operand's value should be corrected on its own. + if (isa(V)) + continue; + + // FIXME: It doesn't really make sense to try to do this for all + // instructions. 
+ V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast(Call); + if (!Intr) { + std::vector ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), + NewType, F->getAttributes()); + Function *NewF = cast(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast(Intr); + Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp new file mode 100644 index 00000000000..3ca0eca3417 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -0,0 +1,63 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom used feature of GPUS, so +// they are not supported at this time. 
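
// Closed form of the per-workitem index that AMDGPUPromoteAlloca computes
// above from llvm.r600.read.tidig.{x,y,z} and llvm.r600.read.local.size.{y,z};
// it selects each workitem's private slot in the [WorkGroupSize x Ty] LDS
// array. The function below is an illustrative restatement, not pass code.
unsigned flatWorkitemId(unsigned TIdX, unsigned TIdY, unsigned TIdZ,
                        unsigned SizeY, unsigned SizeZ) {
  // TID = tid.x * (local_size.y * local_size.z) + tid.y * local_size.z + tid.z
  return TIdX * SizeY * SizeZ + TIdY * SizeZ + TIdZ;
}
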
+//===----------------------------------------------------------------------===// + +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { + static const unsigned SubRegs[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, + AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, + AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, + AMDGPU::sub15 + }; + + assert(Channel < array_lengthof(SubRegs)); + return SubRegs[Channel]; +} + +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + return getSubRegFromChannel(IndirectIndex); +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h new file mode 100644 index 00000000000..cfd800bdc70 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -0,0 +1,64 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUSubtarget; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + static const MCPhysReg CalleeSavedReg; + + AMDGPURegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override { + assert(!"Unimplemented"); return BitVector(); + } + + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { + assert(!"Unimplemented"); return nullptr; + } + + virtual unsigned getHWRegIndex(unsigned Reg) const { + assert(!"Unimplemented"); return 0; + } + + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) + unsigned getSubRegFromChannel(unsigned Channel) const; + + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td new file mode 100644 index 00000000000..835a1464395 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -0,0 +1,26 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + +foreach Index = 0-15 in { + // Indices are used in a variety of ways here, so don't set a size/offset. + def sub#Index : SubRegIndex<-1, -1>; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp new file mode 100644 index 00000000000..605ccd0e136 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -0,0 +1,133 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-subtarget" + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget & +AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS) { + // Determine default and user-specified characteristics + // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be + // enabled, but some instructions do not respect them and they run at the + // double precision rate, so don't enable by default. + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. 
+ + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + FullFS += FS; + + if (GPU == "" && TT.getArch() == Triple::amdgcn) + GPU = "SI"; + + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + FP64Denormals = false; + } + return *this; +} + +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), + DumpCode(false), R600ALUInst(false), HasVertexCache(false), + TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), + FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), + CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), + WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), + GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), + FrameLowering(TargetFrameLowering::StackGrowsUp, + 64 * 16, // Maximum stack alignment (long16) + 0), + InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { + + initializeSubtargetDependencies(TT, GPU, FS); + + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + TLInfo.reset(new R600TargetLowering(TM, *this)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + TLInfo.reset(new SITargetLowering(TM, *this)); + } +} + +unsigned AMDGPUSubtarget::getStackEntrySize() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + switch(getWavefrontSize()) { + case 16: + return 8; + case 32: + return hasCaymanISA() ? 4 : 8; + case 64: + return 4; + default: + llvm_unreachable("Illegal wavefront size."); + } +} + +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { + switch(getGeneration()) { + default: llvm_unreachable("ChipID unknown"); + case SEA_ISLANDS: return 12; + } +} + +bool AMDGPUSubtarget::isVGPRSpillingEnabled( + const SIMachineFunctionInfo *MFI) const { + return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +} + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h new file mode 100644 index 00000000000..0d40d14f820 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -0,0 +1,282 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +namespace llvm { + +class SIMachineFunctionInfo; + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + +public: + enum Generation { + R600 = 0, + R700, + EVERGREEN, + NORTHERN_ISLANDS, + SOUTHERN_ISLANDS, + SEA_ISLANDS, + VOLCANIC_ISLANDS, + }; + + enum { + FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + }; + +private: + std::string DevName; + bool Is64bit; + bool DumpCode; + bool R600ALUInst; + bool HasVertexCache; + short TexVTXClauseSize; + Generation Gen; + bool FP64; + bool FP64Denormals; + bool FP32Denormals; + bool FastFMAF32; + bool CaymanISA; + bool FlatAddressSpace; + bool EnableIRStructurizer; + bool EnablePromoteAlloca; + bool EnableIfCvt; + bool EnableLoadStoreOpt; + unsigned WavefrontSize; + bool CFALUBug; + int LocalMemorySize; + bool EnableVGPRSpilling; + bool SGPRInitBug; + bool IsGCN; + bool GCN1Encoding; + bool GCN3Encoding; + bool CIInsts; + bool FeatureDisable; + int LDSBankCount; + + AMDGPUFrameLowering FrameLowering; + std::unique_ptr TLInfo; + std::unique_ptr InstrInfo; + InstrItineraryData InstrItins; + Triple TargetTriple; + +public: + AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + TargetMachine &TM); + AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + const AMDGPUFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AMDGPUInstrInfo *getInstrInfo() const override { + return InstrInfo.get(); + } + const AMDGPURegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + AMDGPUTargetLowering *getTargetLowering() const override { + return TLInfo.get(); + } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFP64Denormals() const { + return FP64Denormals; + } + + bool hasFastFMAF32() const { + return FastFMAF32; + } + + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + + bool hasBFE() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFM() const { + return hasBFE(); + } + + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + if (Size == 64) + return (getGeneration() >= SOUTHERN_ISLANDS); + + return false; + } + + bool hasMulU24() const { + return (getGeneration() >= 
EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + + unsigned getStackEntrySize() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } + + bool hasSGPRInitBug() const { + return SGPRInitBug; + } + + int getLDSBankCount() const { + return LDSBankCount; + } + + unsigned getAmdKernelCodeChipID() const; + + bool enableMachineScheduler() const override { + return true; + } + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const override; + + // Helper functions to simplify if statements + bool isTargetELF() const { + return false; + } + + StringRef getDeviceName() const { + return DevName; + } + + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + + unsigned getMaxWavesPerCU() const { + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 10; + + // FIXME: Not sure what this is for other subtagets. + llvm_unreachable("do not know max waves per CU for this subtarget."); + } + + bool enableSubRegLiveness() const override { + return true; + } +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp new file mode 100644 index 00000000000..a9a911a8efe --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -0,0 +1,292 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. 
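+///
+/// Two concrete subclasses are defined below: R600TargetMachine for the
+/// VLIW generations (R600 through Cayman) and GCNTargetMachine for
+/// Southern Islands and later GCN parts.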
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "AMDGPUTargetTransformInfo.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include + +using namespace llvm; + +extern "C" void LLVMInitializeAMDGPUTarget() { + // Register the target + RegisterTargetMachine X(TheAMDGPUTarget); + RegisterTargetMachine Y(TheGCNTarget); +} + +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMILive(C, make_unique()); +} + +static MachineSchedRegistry +SchedCustomRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static std::string computeDataLayout(const Triple &TT) { + std::string Ret = "e-p:32:32"; + + if (TT.getArch() == Triple::amdgcn) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OptLevel) + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OptLevel), + TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + IntrinsicInfo() { + setRequiresStructuredCFG(true); + initAsmInfo(); +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { + delete TLOF; +} + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// AMDGPU Pass Setup +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM(); + } + + 
ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + return createR600MachineScheduler(C); + return nullptr; + } + + void addIRPasses() override; + void addCodeGenPrepare() override; + virtual bool addPreISel() override; + virtual bool addInstSelector() override; +}; + +class R600PassConfig : public AMDGPUPassConfig { +public: + R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + + bool addPreISel() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +class GCNPassConfig : public AMDGPUPassConfig { +public: + GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + bool addPreISel() override; + bool addInstSelector() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +} // End of anonymous namespace + +TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); +} + +void AMDGPUPassConfig::addIRPasses() { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. 
+ addPass(createBarrierNoopPass()); + TargetPassConfig::addIRPasses(); +} + +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.isPromoteAllocaEnabled()) { + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + } + TargetPassConfig::addCodeGenPrepare(); +} + +bool +AMDGPUPassConfig::addPreISel() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createFlattenCFGPass()); + if (ST.IsIRStructurizerEnabled()) + addPass(createStructurizeCFGPass()); + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} + +//===----------------------------------------------------------------------===// +// R600 Pass Setup +//===----------------------------------------------------------------------===// + +bool R600PassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createR600TextureIntrinsicsReplacer()); + return false; +} + +void R600PassConfig::addPreRegAlloc() { + addPass(createR600VectorRegMerger(*TM)); +} + +void R600PassConfig::addPreSched2() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createR600EmitClauseMarkers(), false); + if (ST.isIfCvtEnabled()) + addPass(&IfConverterID, false); + addPass(createR600ClauseMergePass(*TM), false); +} + +void R600PassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + addPass(createR600ControlFlowFinalizer(*TM), false); +} + +TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { + return new R600PassConfig(this, PM); +} + +//===----------------------------------------------------------------------===// +// GCN Pass Setup +//===----------------------------------------------------------------------===// + +bool GCNPassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createSinkingPass()); + addPass(createSITypeRewriter()); + addPass(createSIAnnotateControlFlowPass()); + return false; +} + +bool GCNPassConfig::addInstSelector() { + AMDGPUPassConfig::addInstSelector(); + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(createSIFoldOperandsPass()); + return false; +} + +void GCNPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + + // This needs to be run directly before register allocation because + // earlier passes might recompute live intervals. + // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass + if (getOptLevel() > CodeGenOpt::None) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + } + + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. 
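+    //
+    // For example, two adjacent ds_read_b32 loads whose offsets fit the
+    // combined encoding can typically be merged into a single ds_read2_b32
+    // by this pass once the extra address copies have been eliminated.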
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + } + addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addPostRegAlloc() { + addPass(createSIPrepareScratchRegs(), false); + addPass(createSIShrinkInstructionsPass(), false); +} + +void GCNPassConfig::addPreSched2() { + addPass(createSIInsertWaits(*TM), false); +} + +void GCNPassConfig::addPreEmitPass() { + addPass(createSILowerControlFlowPass(*TM), false); +} + +TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { + return new GCNPassConfig(this, PM); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h new file mode 100644 index 00000000000..14792e347a7 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -0,0 +1,89 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H + +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/IR/DataLayout.h" + +namespace llvm { + +//===----------------------------------------------------------------------===// +// AMDGPU Target Machine (R600+) +//===----------------------------------------------------------------------===// + +class AMDGPUTargetMachine : public LLVMTargetMachine { +private: + +protected: + TargetLoweringObjectFile *TLOF; + AMDGPUSubtarget Subtarget; + AMDGPUIntrinsicInfo IntrinsicInfo; + +public: + AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + + const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } + TargetIRAnalysis getTargetIRAnalysis() override; + + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF; + } +}; + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +class R600TargetMachine : public AMDGPUTargetMachine { + +public: + R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +class GCNTargetMachine : public AMDGPUTargetMachine { + +public: + 
GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp new file mode 100644 index 00000000000..6dacc742b12 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -0,0 +1,82 @@ +//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "AMDGPUtti" + +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Threshold = 300; // Twice the default. + UP.MaxCount = UINT_MAX; + UP.Partial = true; + + // TODO: Do we want runtime unrolling? + + for (const BasicBlock *BB : L->getBlocks()) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + for (const Instruction &I : *BB) { + const GetElementPtrInst *GEP = dyn_cast(&I); + if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + continue; + + const Value *Ptr = GEP->getPointerOperand(); + const AllocaInst *Alloca = + dyn_cast(GetUnderlyingObject(Ptr, DL)); + if (Alloca) { + // We want to do whatever we can to limit the number of alloca + // instructions that make it through to the code generator. allocas + // require us to use indirect addressing, which is slow and prone to + // compiler bugs. If this loop does an address calculation on an + // alloca ptr, then we want to use a higher than normal loop unroll + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. + UP.Threshold = 800; + } + } + } +} + +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { + if (Vec) + return 0; + + // Number of VGPRs on SI. + if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 256; + + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } + +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Semi-arbitrary large amount. 
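+  // The loop vectorizer only uses this as an upper bound on its interleave
+  // count, so returning a large value effectively leaves the decision to its
+  // own cost model.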
+ return 64; +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h new file mode 100644 index 00000000000..791c84e6f28 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -0,0 +1,78 @@ +//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// AMDGPU target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUTTIImpl : public BasicTTIImplBase { + typedef BasicTTIImplBase BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + const AMDGPUSubtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) + : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) + : BaseT(static_cast(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) + : BaseT(std::move(static_cast(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { + BaseT::operator=(static_cast(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp new file mode 100644 index 00000000000..c9b25a1a0b8 --- /dev/null +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -0,0 +1,1912 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "structcfg" + +#define DEFAULT_VEC_SLOTS 8 + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +namespace llvm { + void initializeAMDGPUCFGStructurizerPass(PassRegistry&); +} + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. 
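+// (debug-print macros, the INVALIDSCCNUM sentinel and a small ReverseVector
+// helper that is only exercised by the STRESSTEST block ordering in run()).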
+// +//===----------------------------------------------------------------------===// +namespace { +#define SHOWNEWINSTR(i) \ + DEBUG(dbgs() << "New instr: " << *i << "\n"); + +#define SHOWNEWBLK(b, msg) \ +DEBUG( \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + dbgs() << "\n"; \ +); + +#define SHOWBLK_DETAIL(b, msg) \ +DEBUG( \ + if (b) { \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(dbgs()); \ + dbgs() << "\n"; \ + } \ +); + +#define INVALIDSCCNUM -1 + +template +void ReverseVector(SmallVectorImpl &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + + +namespace { + +class BlockInformation { +public: + bool IsRetired; + int SccNum; + BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUCFGStructurizer : public MachineFunctionPass { +public: + typedef SmallVector MBBVector; + typedef std::map MBBInfoMap; + typedef std::map LoopLandInfoMap; + + enum PathToKind { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + }; + + static char ID; + + AMDGPUCFGStructurizer() : + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { + initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AMDGPU Control Flow Graph structurizer Pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + /// Perform the CFG structurization + bool run(); + + /// Perform the CFG preparation + /// This step will remove every unconditionnal/dead jump instructions and make + /// sure all loops have an exit block + bool prepare(); + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = &TII->getRegisterInfo(); + DEBUG(MF.dump();); + OrderedBlks.clear(); + Visited.clear(); + FuncRep = &MF; + MLI = &getAnalysis(); + DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); + MDT = &getAnalysis(); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); + PDT = &getAnalysis(); + DEBUG(PDT->print(dbgs());); + prepare(); + run(); + DEBUG(MF.dump();); + return true; + } + +protected: + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; + MachineLoopInfo *MLI; + const R600InstrInfo *TII; + const AMDGPURegisterInfo *TRI; + + // PRINT FUNCTIONS + /// Print the ordered Blocks. 
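+  /// Each block is printed as BB<number>(<scc>,<size>), ten entries per
+  /// line, in the order produced by orderBlocks().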
+ void printOrderedBlocks() const { + size_t i = 0; + for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), + iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { + dbgs() << "BB" << (*iterBlk)->getNumber(); + dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + dbgs() << "\n"; + } else { + dbgs() << " "; + } + } + } + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { + for (MachineLoop::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { + (*iter)->print(dbgs(), 0); + } + } + + // UTILITY FUNCTIONS + int getSCCNum(MachineBasicBlock *MBB) const; + MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; + bool hasBackEdge(MachineBasicBlock *MBB) const; + static unsigned getLoopDepth(MachineLoop *LoopRep); + bool isRetiredBlock(MachineBasicBlock *MBB) const; + bool isActiveLoophead(MachineBasicBlock *MBB) const; + PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry = true) const; + int countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const; + bool needMigrateBlock(MachineBasicBlock *MBB) const; + + // Utility Functions + void reversePredicateSetter(MachineBasicBlock::iterator I); + /// Compute the reversed DFS post order of Blocks + void orderBlocks(MachineFunction *MF); + + // Function originally from CFGStructTraits + void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); + void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, + DebugLoc DL); + void insertCondBranchBefore(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL); + void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); + static int getBranchNzeroOpcode(int OldOpcode); + static int getBranchZeroOpcode(int OldOpcode); + static int getContinueNzeroOpcode(int OldOpcode); + static int getContinueZeroOpcode(int OldOpcode); + static MachineBasicBlock *getTrueBranch(MachineInstr *MI); + static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); + static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI); + static bool isCondBranch(MachineInstr *MI); + static bool isUncondBranch(MachineInstr *MI); + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); + /// The correct naming for this is getPossibleLoopendBlockBranchInstr. + /// + /// BB with backward-edge could have move instructions after the branch + /// instruction. Such move instruction "belong to" the loop backward-edge. 
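+  /// The implementation therefore scans the block in reverse, skipping
+  /// trailing moves, and returns the first branch it finds (or null if a
+  /// non-move, non-branch instruction is reached first).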
+ MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); + static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); + static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); + static bool isReturnBlock(MachineBasicBlock *MBB); + static void cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) ; + static MachineBasicBlock *clone(MachineBasicBlock *MBB); + /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose + /// because the AMDGPU instruction is not recognized as terminator fix this + /// and retire this routine + void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, + MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); + static void wrapup(MachineBasicBlock *MBB); + + + int patternMatch(MachineBasicBlock *MBB); + int patternMatchGroup(MachineBasicBlock *MBB); + int serialPatternMatch(MachineBasicBlock *MBB); + int ifPatternMatch(MachineBasicBlock *MBB); + int loopendPatternMatch(); + int mergeLoop(MachineLoop *LoopRep); + int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); + + void handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop); + /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in + /// the same loop with LoopLandInfo without explicitly keeping track of + /// loopContBlks and loopBreakBlks, this is a method to get the information. + bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, + MachineBasicBlock *Src2MBB); + int handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr); + void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock *LandMBB, bool Detail = false); + int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); + void mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB); + + void mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); + void mergeLooplandBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *LandMBB); + void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB); + void settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB); + /// normalizeInfiniteLoopExit change + /// B1: + /// uncond_br LoopHeader + /// + /// to + /// B1: + /// cond_br 1 LoopHeader dummyExit + /// and return the newly added dummy exit block + MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); + void removeUnconditionalBranch(MachineBasicBlock *MBB); + /// Remove duplicate branches instructions in a block. 
+ /// For instance + /// B0: + /// cond_br X B1 B2 + /// cond_br X B1 B2 + /// is transformed to + /// B0: + /// cond_br X B1 B2 + void removeRedundantConditionalBranch(MachineBasicBlock *MBB); + void addDummyExitBlock(SmallVectorImpl &RetMBB); + void removeSuccessor(MachineBasicBlock *MBB); + MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB); + void migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); + void recordSccnum(MachineBasicBlock *MBB, int SCCNum); + void retireBlock(MachineBasicBlock *MBB); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); + + MachineBasicBlock *findNearestCommonPostDom(std::set&); + /// This is work around solution for findNearestCommonDominator not available + /// to post dom a proper fix should go to Dominators.h. + MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2); + +private: + MBBInfoMap BlockInfoMap; + LoopLandInfoMap LLInfoMap; + std::map Visited; + MachineFunction *FuncRep; + SmallVector OrderedBlks; +}; + +int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return INVALIDSCCNUM; + return (*It).second->SccNum; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) + const { + LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); + if (It == LLInfoMap.end()) + return nullptr; + return (*It).second; +} + +bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + if (!LoopRep) + return false; + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + return MBB->isSuccessor(LoopHeader); +} + +unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { + return LoopRep ? 
LoopRep->getLoopDepth() : 0; +} + +bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return false; + return (*It).second->IsRetired; +} + +bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + while (LoopRep && LoopRep->getHeader() == MBB) { + MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); + if(!LoopLand) + return true; + if (!isRetiredBlock(LoopLand)) + return true; + LoopRep = LoopRep->getParentLoop(); + } + return false; +} +AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry) const { + assert(DstMBB); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + while (SrcMBB && SrcMBB->succ_size() == 1) { + SrcMBB = *SrcMBB->succ_begin(); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + if (!AllowSideEntry && SrcMBB->pred_size() > 1) + return Not_SinglePath; + } + if (SrcMBB && SrcMBB->succ_size()==0) + return SinglePath_NotInPath; + return Not_SinglePath; +} + +int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const { + int Count = 0; + while (It != E) { + if (!isRetiredBlock(*It)) + ++Count; + ++It; + } + return Count; +} + +bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { + unsigned BlockSizeThreshold = 30; + unsigned CloneInstrThreshold = 100; + bool MultiplePreds = MBB && (MBB->pred_size() > 1); + + if(!MultiplePreds) + return false; + unsigned BlkSize = MBB->size(); + return ((BlkSize > BlockSizeThreshold) && + (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); +} + +void AMDGPUCFGStructurizer::reversePredicateSetter( + MachineBasicBlock::iterator I) { + while (I--) { + if (I->getOpcode() == AMDGPU::PRED_X) { + switch (static_cast(I)->getOperand(2).getImm()) { + case OPCODE_IS_ZERO_INT: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO_INT); + return; + case OPCODE_IS_NOT_ZERO_INT: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO_INT); + return; + case OPCODE_IS_ZERO: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO); + return; + case OPCODE_IS_NOT_ZERO: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO); + return; + default: + llvm_unreachable("PRED_X Opcode invalid!"); + } + } + } +} + +void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = MBB->getParent() + ->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->push_back(MI); + //assume the instruction doesn't take any reg operand ... + SHOWNEWINSTR(MI); +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); + if (MBB->begin() != MBB->end()) + MBB->insert(MBB->begin(), MI); + else + MBB->push_back(MI); + SHOWNEWINSTR(MI); + return MI; +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( + MachineBasicBlock::iterator I, int NewOpcode) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineInstr *NewMBB = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->insert(I, NewMBB); + //assume the instruction doesn't take any reg operand ... 
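+  // Note that, despite its name, NewMBB here is the newly created
+  // MachineInstr, not a basic block.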
+ SHOWNEWINSTR(NewMBB); + return NewMBB; +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->insert(I, NewMI); + MachineInstrBuilder MIB(*MF, NewMI); + MIB.addReg(OldMI->getOperand(1).getReg(), false); + SHOWNEWINSTR(NewMI); + //erase later oldInstr->eraseFromParent(); +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL) { + MachineFunction *MF = blk->getParent(); + MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + //insert before + blk->insert(I, NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, + int NewOpcode, int RegNum) { + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewInstr = + MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->push_back(NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: llvm_unreachable("internal error"); + }; + return -1; +} + +int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { + return MI->getOperand(0).getMBB(); +} + +void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, + MachineBasicBlock *MBB) { + MI->getOperand(0).setMBB(MBB); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI) { + assert(MBB->succ_size() == 2); + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + MachineBasicBlock::succ_iterator It = MBB->succ_begin(); + MachineBasicBlock::succ_iterator Next = It; + ++Next; + return (*It == TrueBranch) ? 
*Next : *It; +} + +bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP_COND: + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return true; + default: + return false; + } + return false; +} + +bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP: + case AMDGPU::BRANCH: + return true; + default: + return false; + } + return false; +} + +DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); + ++It) { + MachineInstr *instr = &(*It); + if (instr->getDebugLoc()) + DL = instr->getDebugLoc(); + } + return DL; +} + +MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( + MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + MachineInstr *MI = &*It; + if (MI && (isCondBranch(MI) || isUncondBranch(MI))) + return MI; + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( + MachineBasicBlock *MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); + It != E; ++It) { + // FIXME: Simplify + MachineInstr *MI = &*It; + if (MI) { + if (isCondBranch(MI) || isUncondBranch(MI)) + return MI; + else if (!TII->isMov(MI->getOpcode())) + break; + } + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *instr = &(*It); + if (instr->getOpcode() == AMDGPU::RETURN) + return instr; + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *MI = &(*It); + if (MI->getOpcode() == AMDGPU::CONTINUE) + return MI; + } + return nullptr; +} + +bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { + MachineInstr *MI = getReturnInstr(MBB); + bool IsReturn = (MBB->succ_size() == 0); + if (MI) + assert(IsReturn); + else if (IsReturn) + DEBUG( + dbgs() << "BB" << MBB->getNumber() + <<" is return block without RETURN instr\n";); + return IsReturn; +} + +void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), + iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) + DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of +} + +MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { + MachineFunction *Func = MBB->getParent(); + MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); + Func->push_back(NewMBB); //insert to function + for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); + It != E; ++It) { + MachineInstr *MI = Func->CloneMachineInstr(It); + NewMBB->push_back(MI); + } + return NewMBB; +} + +void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( + MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, + MachineBasicBlock *NewBlk) { + MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); + if (BranchMI && isCondBranch(BranchMI) && + getTrueBranch(BranchMI) == OldMBB) + setTrueBranch(BranchMI, NewBlk); +} + +void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { + assert((!MBB->getParent()->getJumpTableInfo() + || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector ContInstr; + MachineBasicBlock::iterator Pre = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator It = Pre; + while (It != E) { + if (Pre->getOpcode() == AMDGPU::CONTINUE + && It->getOpcode() == AMDGPU::ENDLOOP) + ContInstr.push_back(Pre); + Pre = It; + ++It; + } + + //delete continue right before endloop + for (unsigned i = 0; i < ContInstr.size(); ++i) + ContInstr[i]->eraseFromParent(); + + // TODO to fix up jump table so later phase won't be confused. if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + +} + + +bool AMDGPUCFGStructurizer::prepare() { + bool Changed = false; + + //FIXME: if not reducible flow graph, make it so ??? + + DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + + orderBlocks(FuncRep); + + SmallVector RetBlks; + + // Add an ExitBlk to loop that don't have one + for (MachineLoopInfo::iterator It = MLI->begin(), + E = MLI->end(); It != E; ++It) { + MachineLoop *LoopRep = (*It); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + + if (ExitingMBBs.size() == 0) { + MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); + if (DummyExitBlk) + RetBlks.push_back(DummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + for (SmallVectorImpl::const_iterator + It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + removeUnconditionalBranch(MBB); + removeRedundantConditionalBranch(MBB); + if (isReturnBlock(MBB)) { + RetBlks.push_back(MBB); + } + assert(MBB->succ_size() <= 2); + } + + if (RetBlks.size() >= 2) { + addDummyExitBlock(RetBlks); + Changed = true; + } + + return Changed; +} + +bool AMDGPUCFGStructurizer::run() { + + //Assume reducible CFG... + DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); + int NumIter = 0; + bool Finish = false; + MachineBasicBlock *MBB; + bool MakeProgress = false; + int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), + OrderedBlks.end()); + + do { + ++NumIter; + DEBUG( + dbgs() << "numIter = " << NumIter + << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; + ); + + SmallVectorImpl::const_iterator It = + OrderedBlks.begin(); + SmallVectorImpl::const_iterator E = + OrderedBlks.end(); + + SmallVectorImpl::const_iterator SccBeginIter = + It; + MachineBasicBlock *SccBeginMBB = nullptr; + int SccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int SccNumIter; // Number of iteration in this SCC. + + while (It != E) { + MBB = *It; + + if (!SccBeginMBB) { + SccBeginIter = It; + SccBeginMBB = MBB; + SccNumIter = 0; + SccNumBlk = NumRemainedBlk; // Init to maximum possible number. + DEBUG( + dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); + dbgs() << "\n"; + ); + } + + if (!isRetiredBlock(MBB)) + patternMatch(MBB); + + ++It; + + bool ContNextScc = true; + if (It == E + || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { + // Just finish one scc. 
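+        // Compare the number of still-active blocks in this SCC against the
+        // previous sweep: if it stopped shrinking we give up on the SCC, and
+        // if it shrank we rewind to its first block and sweep it again.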
+ ++SccNumIter; + int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { + DEBUG( + dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n"; + ); + ContNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { + SccNumBlk = sccRemainedNumBlk; + It = SccBeginIter; + ContNextScc = false; + DEBUG( + dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n'; + ); + } else { + // Finish the current scc. + ContNextScc = true; + } + } else { + // Continue on next component in the current scc. + ContNextScc = false; + } + + if (ContNextScc) + SccBeginMBB = nullptr; + } //while, "one iteration" over the function. + + MachineBasicBlock *EntryMBB = + GraphTraits::nodes_begin(FuncRep); + if (EntryMBB->succ_size() == 0) { + Finish = true; + DEBUG( + dbgs() << "Reduce to one block\n"; + ); + } else { + int NewnumRemainedBlk + = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); + // consider cloned blocks ?? + if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { + MakeProgress = true; + NumRemainedBlk = NewnumRemainedBlk; + } else { + MakeProgress = false; + DEBUG( + dbgs() << "No progress\n"; + ); + } + } + } while (!Finish && MakeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + wrapup(GraphTraits::nodes_begin(FuncRep)); + + // Detach retired Block, release memory. + for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); + It != E; ++It) { + if ((*It).second && (*It).second->IsRetired) { + assert(((*It).first)->getNumber() != -1); + DEBUG( + dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; + ); + (*It).first->eraseFromParent(); //Remove from the parent Function. 
+ } + delete (*It).second; + } + BlockInfoMap.clear(); + LLInfoMap.clear(); + + if (!Finish) { + DEBUG(FuncRep->viewCFG()); + llvm_unreachable("IRREDUCIBLE_CFG"); + } + + return true; +} + + + +void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { + int SccNum = 0; + MachineBasicBlock *MBB; + for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); + ++It, ++SccNum) { + const std::vector &SccNext = *It; + for (std::vector::const_iterator + blockIter = SccNext.begin(), blockEnd = SccNext.end(); + blockIter != blockEnd; ++blockIter) { + MBB = *blockIter; + OrderedBlks.push_back(MBB); + recordSccnum(MBB, SccNum); + } + } + + //walk through all the block in func to check for unreachable + typedef GraphTraits GTM; + MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); + for (; It != E; ++It) { + MachineBasicBlock *MBB = &(*It); + SccNum = getSCCNum(MBB); + if (SccNum == INVALIDSCCNUM) + dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; + } +} + +int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { + int NumMatch = 0; + int CurMatch; + + DEBUG( + dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; + ); + + while ((CurMatch = patternMatchGroup(MBB)) > 0) + NumMatch += CurMatch; + + DEBUG( + dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n"; + ); + + return NumMatch; +} + +int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { + int NumMatch = 0; + NumMatch += loopendPatternMatch(); + NumMatch += serialPatternMatch(MBB); + NumMatch += ifPatternMatch(MBB); + return NumMatch; +} + + +int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { + if (MBB->succ_size() != 1) + return 0; + + MachineBasicBlock *childBlk = *MBB->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) + return 0; + + mergeSerialBlock(MBB, childBlk); + ++numSerialPatternMatch; + return 1; +} + +int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { + //two edges + if (MBB->succ_size() != 2) + return 0; + if (hasBackEdge(MBB)) + return 0; + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + if (!BranchMI) + return 0; + + assert(isCondBranch(BranchMI)); + int NumMatch = 0; + + MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); + NumMatch += serialPatternMatch(TrueMBB); + NumMatch += ifPatternMatch(TrueMBB); + MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); + NumMatch += serialPatternMatch(FalseMBB); + NumMatch += ifPatternMatch(FalseMBB); + MachineBasicBlock *LandBlk; + int Cloned = 0; + + assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); + // TODO: Simplify + if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 + && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { + // Diamond pattern + LandBlk = *TrueMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { + // Triangle pattern, false is empty + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && *FalseMBB->succ_begin() == TrueMBB) { + // Triangle pattern, true is empty + // We reverse the predicate to make a triangle, empty false pattern; + std::swap(TrueMBB, FalseMBB); + reversePredicateSetter(MBB->end()); + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { + LandBlk = *FalseMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 + && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { + LandBlk = 
*TrueMBB->succ_begin(); + } else { + return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); + } + + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new challenge to the + // reduction process. + if (LandBlk && + ((TrueMBB && TrueMBB->pred_size() > 1) + || (FalseMBB && FalseMBB->pred_size() > 1))) { + Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); + } + + if (TrueMBB && TrueMBB->pred_size() > 1) { + TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); + ++Cloned; + } + + if (FalseMBB && FalseMBB->pred_size() > 1) { + FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); + ++Cloned; + } + + mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); + + ++numIfPatternMatch; + + numClonedBlock += Cloned; + + return 1 + Cloned + NumMatch; +} + +int AMDGPUCFGStructurizer::loopendPatternMatch() { + std::deque NestedLoops; + for (auto &It: *MLI) + for (MachineLoop *ML : depth_first(It)) + NestedLoops.push_front(ML); + + if (NestedLoops.size() == 0) + return 0; + + // Process nested loop outside->inside (we did push_front), + // so "continue" to a outside loop won't be mistaken as "break" + // of the current loop. + int Num = 0; + for (MachineLoop *ExaminedLoop : NestedLoops) { + if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) + continue; + DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); + int NumBreak = mergeLoop(ExaminedLoop); + if (NumBreak == -1) + break; + Num += NumBreak; + } + return Num; +} + +int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); + DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); + // We assume a single ExitBlk + MBBVector ExitBlks; + LoopRep->getExitBlocks(ExitBlks); + SmallPtrSet ExitBlkSet; + for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) + ExitBlkSet.insert(ExitBlks[i]); + assert(ExitBlkSet.size() == 1); + MachineBasicBlock *ExitBlk = *ExitBlks.begin(); + assert(ExitBlk && "Loop has several exit block"); + MBBVector LatchBlks; + typedef GraphTraits > InvMBBTraits; + InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), + PE = InvMBBTraits::child_end(LoopHeader); + for (; PI != PE; PI++) { + if (LoopRep->contains(*PI)) + LatchBlks.push_back(*PI); + } + + for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) + mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); + for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) + settleLoopcontBlock(LatchBlks[i], LoopHeader); + int Match = 0; + do { + Match = 0; + Match += serialPatternMatch(LoopHeader); + Match += ifPatternMatch(LoopHeader); + } while (Match > 0); + mergeLooplandBlock(LoopHeader, ExitBlk); + MachineLoop *ParentLoop = LoopRep->getParentLoop(); + if (ParentLoop) + MLI->changeLoopFor(LoopHeader, ParentLoop); + else + MLI->removeBlock(LoopHeader); + Visited[LoopRep] = true; + return 1; +} + +int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, + MachineBasicBlock *LoopHeader) { + int NumCont = 0; + SmallVector ContMBB; + typedef GraphTraits > GTIM; + GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), + E = GTIM::child_end(LoopHeader); + for (; It != E; ++It) { + MachineBasicBlock *MBB = *It; + if (LoopRep->contains(MBB)) { + handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), + LoopHeader, LoopRep); + 
ContMBB.push_back(MBB); + ++NumCont; + } + } + + for (SmallVectorImpl::iterator It = ContMBB.begin(), + E = ContMBB.end(); It != E; ++It) { + (*It)->removeSuccessor(LoopHeader); + } + + numLoopcontPatternMatch += NumCont; + + return NumCont; +} + + +bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( + MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { + if (Src1MBB->succ_size() == 0) { + MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); + if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { + MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; + if (TheEntry) { + DEBUG( + dbgs() << "isLoopContBreakBlock yes src1 = BB" + << Src1MBB->getNumber() + << " src2 = BB" << Src2MBB->getNumber() << "\n"; + ); + return true; + } + } + } + return false; +} + +int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); + if (Num == 0) { + DEBUG( + dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + ); + Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); + } + return Num; +} + +int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = 0; + MachineBasicBlock *DownBlk; + + //trueBlk could be the common post dominator + DownBlk = TrueMBB; + + DEBUG( + dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() + << " false = BB" << FalseMBB->getNumber() << "\n"; + ); + + while (DownBlk) { + DEBUG( + dbgs() << "check down = BB" << DownBlk->getNumber(); + ); + + if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { + DEBUG( + dbgs() << " working\n"; + ); + + Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); + Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); + + numClonedBlock += Num; + Num += serialPatternMatch(*HeadMBB->succ_begin()); + Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); + Num += ifPatternMatch(HeadMBB); + assert(Num > 0); + + break; + } + DEBUG( + dbgs() << " not working\n"; + ); + DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; + } // walk down the postDomTree + + return Num; +} + +void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( + MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { + dbgs() << "head = BB" << HeadMBB->getNumber() + << " size = " << HeadMBB->size(); + if (Detail) { + dbgs() << "\n"; + HeadMBB->print(dbgs()); + dbgs() << "\n"; + } + + if (TrueMBB) { + dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " + << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + TrueMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (FalseMBB) { + dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " + << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + FalseMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (LandMBB) { + dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " + << LandMBB->size() << " numPred = " << LandMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + LandMBB->print(dbgs()); + dbgs() << "\n"; + } + } + + dbgs() << "\n"; +} + +int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr) { + bool MigrateTrue = false; + bool MigrateFalse = false; + + MachineBasicBlock *LandBlk = *LandMBBPtr; + + assert((!TrueMBB || TrueMBB->succ_size() <= 1) + && (!FalseMBB || FalseMBB->succ_size() <= 1)); + + if (TrueMBB == FalseMBB) + return 0; + + MigrateTrue = needMigrateBlock(TrueMBB); + MigrateFalse = needMigrateBlock(FalseMBB); + + if (!MigrateTrue && !MigrateFalse) + return 0; + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. + if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) + MigrateTrue = true; + if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) + MigrateFalse = true; + + DEBUG( + dbgs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk + // + // new: headBlk => if () {initReg = 1; org trueBlk branch} else + // {initReg = 0; org falseBlk branch } + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} + // => org landBlk + // if landBlk->pred_size() > 2, put the about if-else inside + // if (initReg !=2) {...} + // + // add initReg = initVal to headBlk + + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (!MigrateTrue || !MigrateFalse) { + // XXX: We have an opportunity here to optimize the "branch into if" case + // here. Branch into if looks like this: + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true + // \ / + // done + // + // The diamond_head block begins the "if" and the diamond_true block + // is the block being "branched into". + // + // If MigrateTrue is true, then TrueBB is the block being "branched into" + // and if MigrateFalse is true, then FalseBB is the block being + // "branched into" + // + // Here is the pseudo code for how I think the optimization should work: + // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. + // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. + // 3. 
Move the branch instruction from diamond_head into its own basic + // block (new_block). + // 4. Add an unconditional branch from diamond_head to new_block + // 5. Replace the branch instruction in branch_from with an unconditional + // branch to new_block. If branch_from has multiple predecessors, then + // we need to replace the True/False block in the branch + // instruction instead of replacing it. + // 6. Change the condition of the branch instruction in new_block from + // COND to (COND || GPR0) + // + // In order insert these MOV instruction, we will need to use the + // RegisterScavenger. Usually liveness stops being tracked during + // the late machine optimization passes, however if we implement + // bool TargetRegisterInfo::requiresRegisterScavenging( + // const MachineFunction &MF) + // and have it return true, liveness will be tracked correctly + // by generic optimization passes. We will also need to make sure that + // all of our target-specific passes that run after regalloc and before + // the CFGStructurizer track liveness and we will need to modify this pass + // to correctly track liveness. + // + // After the above changes, the new CFG should look like this: + // entry + // / | + // diamond_head branch_from + // \ / + // new_block + // / | + // diamond_false diamond_true + // \ / + // done + // + // Without this optimization, we are forced to duplicate the diamond_true + // block and we will end up with a CFG like this: + // + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true diamond_true (duplicate) + // \ / | + // done --------------------| + // + // Duplicating diamond_true can be very costly especially if it has a + // lot of instructions. + return 0; + } + + int NumNewBlk = 0; + + bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); + + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + + if (LandBlkHasOtherPred) { + llvm_unreachable("Extra register needed to handle CFG"); + unsigned CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra compare instruction needed to handle CFG"); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + CmpResReg, DebugLoc()); + } + + // XXX: We are running this after RA, so creating virtual registers will + // cause an assertion failure in the PostRA scheduling pass. + unsigned InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + DebugLoc()); + + if (MigrateTrue) { + migrateInstruction(TrueMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 1). 
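  // A minimal sketch of the liveness hook referred to in the "branch into if"
  // comment above: keeping liveness tracked after register allocation is what
  // would let this pass insert the MOV instructions via the RegisterScavenger.
  // In LLVM a target opts in by overriding
  // TargetRegisterInfo::requiresRegisterScavenging; the AMDGPURegisterInfo
  // spelling below is illustrative only and is not part of this patch.
  //
  //   bool AMDGPURegisterInfo::requiresRegisterScavenging(
  //       const MachineFunction &MF) const {
  //     // Returning true keeps liveness valid through post-RA target passes,
  //     // which the optimization sketched above depends on.
  //     return true;
  //   }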
+ llvm_unreachable("Extra register needed to handle CFG"); + } + insertInstrBefore(I, AMDGPU::ELSE); + + if (MigrateFalse) { + migrateInstruction(FalseMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 0) + llvm_unreachable("Extra register needed to handle CFG"); + } + + if (LandBlkHasOtherPred) { + // add endif + insertInstrBefore(I, AMDGPU::ENDIF); + + // put initReg = 2 to other predecessors of landBlk + for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), + PE = LandBlk->pred_end(); PI != PE; ++PI) { + MachineBasicBlock *MBB = *PI; + if (MBB != TrueMBB && MBB != FalseMBB) + llvm_unreachable("Extra register needed to handle CFG"); + } + } + DEBUG( + dbgs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // update landBlk + *LandMBBPtr = LandBlk; + + return NumNewBlk; +} + +void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop) { + DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() + << " header = BB" << ContMBB->getNumber() << "\n"; + dbgs() << "Trying to continue loop-depth = " + << getLoopDepth(ContLoop) + << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); + settleLoopcontBlock(ContingMBB, ContMBB); +} + +void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + DEBUG( + dbgs() << "serialPattern BB" << DstMBB->getNumber() + << " <= BB" << SrcMBB->getNumber() << "\n"; + ); + DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); + + DstMBB->removeSuccessor(SrcMBB); + cloneSuccessorList(DstMBB, SrcMBB); + + removeSuccessor(SrcMBB); + MLI->removeBlock(SrcMBB); + retireBlock(SrcMBB); +} + +void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { + assert (TrueMBB); + DEBUG( + dbgs() << "ifPattern BB" << MBB->getNumber(); + dbgs() << "{ "; + if (TrueMBB) { + dbgs() << "BB" << TrueMBB->getNumber(); + } + dbgs() << " } else "; + dbgs() << "{ "; + if (FalseMBB) { + dbgs() << "BB" << FalseMBB->getNumber(); + } + dbgs() << " }\n "; + dbgs() << "landBlock: "; + if (!LandMBB) { + dbgs() << "NULL"; + } else { + dbgs() << "BB" << LandMBB->getNumber(); + } + dbgs() << "\n"; + ); + + int OldOpcode = BranchMI->getOpcode(); + DebugLoc BranchDL = BranchMI->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + MachineBasicBlock::iterator I = BranchMI; + insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), + BranchDL); + + if (TrueMBB) { + MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); + MBB->removeSuccessor(TrueMBB); + if (LandMBB && TrueMBB->succ_size()!=0) + TrueMBB->removeSuccessor(LandMBB); + retireBlock(TrueMBB); + MLI->removeBlock(TrueMBB); + } + + if (FalseMBB) { + insertInstrBefore(I, AMDGPU::ELSE); + MBB->splice(I, FalseMBB, FalseMBB->begin(), + FalseMBB->end()); + MBB->removeSuccessor(FalseMBB); + if (LandMBB && FalseMBB->succ_size() != 0) + FalseMBB->removeSuccessor(LandMBB); + retireBlock(FalseMBB); + MLI->removeBlock(FalseMBB); + } + insertInstrBefore(I, AMDGPU::ENDIF); + + BranchMI->eraseFromParent(); + + if (LandMBB && TrueMBB && FalseMBB) + MBB->addSuccessor(LandMBB); + +} + +void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + + insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + DstBlk->addSuccessor(LandMBB); + DstBlk->removeSuccessor(DstBlk); +} + + +void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); + assert(BranchMI && isCondBranch(BranchMI)); + DebugLoc DL = BranchMI->getDebugLoc(); + MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); + MachineBasicBlock::iterator I = BranchMI; + if (TrueBranch != LandMBB) + reversePredicateSetter(I); + insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); + insertInstrBefore(I, AMDGPU::BREAK); + insertInstrBefore(I, AMDGPU::ENDIF); + //now branchInst can be erase safely + BranchMI->eraseFromParent(); + //now take care of successors, retire blocks + ExitingMBB->removeSuccessor(LandMBB); +} + +void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB) { + DEBUG(dbgs() << "settleLoopcontBlock conting = BB" + << ContingMBB->getNumber() + << ", cont = BB" << ContMBB->getNumber() << "\n";); + + MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); + if (MI) { + assert(isCondBranch(MI)); + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + int OldOpcode = MI->getOpcode(); + DebugLoc DL = MI->getDebugLoc(); + + bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); + + if (!UseContinueLogical) { + int BranchOpcode = + TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : + getBranchZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); + insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + } else { + int BranchOpcode = + TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : + getContinueZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + } + + MI->eraseFromParent(); + } else { + // if we've arrived here then we've already erased the branch instruction + // travel back up the basic block to see the last reference of our debug + // location we've just inserted that reference here so it should be + // representative insertEnd to ensure phi-moves, if exist, go before the + // continue-instr. 
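  // Illustration (not part of this patch): taken together, mergeLooplandBlock,
  // mergeLoopbreakBlock and settleLoopcontBlock above rewrite a natural loop
  // with a conditional exit and a latch into roughly the structured pseudo-op
  // form below, with the surviving blocks folded into the loop header:
  //
  //   WHILELOOP
  //     ...
  //     IF_PREDICATE_SET PREDICATE_BIT   // the (possibly reversed) exit test
  //       BREAK
  //     ENDIF
  //     ...
  //     CONTINUE                         // emitted for latches that need it
  //   ENDLOOP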
+ insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + getLastDebugLocInBB(ContingMBB)); + } +} + +int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { + int Cloned = 0; + assert(PreMBB->isSuccessor(SrcMBB)); + while (SrcMBB && SrcMBB != DstMBB) { + assert(SrcMBB->succ_size() == 1); + if (SrcMBB->pred_size() > 1) { + SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); + ++Cloned; + } + + PreMBB = SrcMBB; + SrcMBB = *SrcMBB->succ_begin(); + } + + return Cloned; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB) { + assert(PredMBB->isSuccessor(MBB) && + "succBlk is not a prececessor of curBlk"); + + MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions + replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); + //srcBlk, oldBlk, newBlk + + PredMBB->removeSuccessor(MBB); + PredMBB->addSuccessor(CloneMBB); + + // add all successor to cloneBlk + cloneSuccessorList(CloneMBB, MBB); + + numClonedInstr += MBB->size(); + + DEBUG( + dbgs() << "Cloned block: " << "BB" + << MBB->getNumber() << "size " << MBB->size() << "\n"; + ); + + SHOWNEWBLK(CloneMBB, "result of Cloned block: "); + + return CloneMBB; +} + +void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator SpliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); + if (!BranchMI) { + DEBUG( + dbgs() << "migrateInstruction don't see branch instr\n" ; + ); + SpliceEnd = SrcMBB->end(); + } else { + DEBUG( + dbgs() << "migrateInstruction see branch instr\n" ; + BranchMI->dump(); + ); + SpliceEnd = BranchMI; + } + DEBUG( + dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); + + //splice insert before insertPos + DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); + + DEBUG( + dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + if (!LoopHeader || !LoopLatch) + return nullptr; + MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); + // Is LoopRep an infinite loop ? 
+ if (!BranchMI || !isUncondBranch(BranchMI)) + return nullptr; + + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); + MachineBasicBlock::iterator I = BranchMI; + unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra register needed to handle CFG"); + MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); + MachineInstrBuilder MIB(*FuncRep, NewMI); + MIB.addMBB(LoopHeader); + MIB.addReg(ImmReg, false); + SHOWNEWINSTR(NewMI); + BranchMI->eraseFromParent(); + LoopLatch->addSuccessor(DummyExitBlk); + + return DummyExitBlk; +} + +void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { + MachineInstr *BranchMI; + + // I saw two unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. + while ((BranchMI = getLoopendBlockBranchInstr(MBB)) + && isUncondBranch(BranchMI)) { + DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + BranchMI->eraseFromParent(); + } +} + +void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( + MachineBasicBlock *MBB) { + if (MBB->succ_size() != 2) + return; + MachineBasicBlock *MBB1 = *MBB->succ_begin(); + MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); + if (MBB1 != MBB2) + return; + + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + assert(BranchMI && isCondBranch(BranchMI)); + DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + BranchMI->eraseFromParent(); + SHOWNEWBLK(MBB1, "Removing redundant successor"); + MBB->removeSuccessor(MBB1); +} + +void AMDGPUCFGStructurizer::addDummyExitBlock( + SmallVectorImpl &RetMBB) { + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + + for (SmallVectorImpl::iterator It = RetMBB.begin(), + E = RetMBB.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + MachineInstr *MI = getReturnInstr(MBB); + if (MI) + MI->eraseFromParent(); + MBB->addSuccessor(DummyExitBlk); + DEBUG( + dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n"; + ); + } + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); +} + +void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { + while (MBB->succ_size()) + MBB->removeSuccessor(*MBB->succ_begin()); +} + +void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, + int SccNum) { + BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; + if (!srcBlkInfo) + srcBlkInfo = new BlockInformation(); + srcBlkInfo->SccNum = SccNum; +} + +void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { + DEBUG( + dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; + ); + + BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; + + if (!SrcBlkInfo) + SrcBlkInfo = new BlockInformation(); + + SrcBlkInfo->IsRetired = true; + assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 + && "can't retire block yet"); +} + +void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, + MachineBasicBlock *MBB) { + MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; + if (!MBB) { + MBB = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(MBB); //insert to function + SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); + } + TheEntry 
= MBB; + DEBUG( + dbgs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << MBB->getNumber() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2) { + + if (PDT->dominates(MBB1, MBB2)) + return MBB1; + if (PDT->dominates(MBB2, MBB1)) + return MBB2; + + MachineDomTreeNode *Node1 = PDT->getNode(MBB1); + MachineDomTreeNode *Node2 = PDT->getNode(MBB2); + + // Handle newly cloned node. + if (!Node1 && MBB1->succ_size() == 1) + return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); + if (!Node2 && MBB2->succ_size() == 1) + return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); + + if (!Node1 || !Node2) + return nullptr; + + Node1 = Node1->getIDom(); + while (Node1) { + if (PDT->dominates(Node1, Node2)) + return Node1->getBlock(); + Node1 = Node1->getIDom(); + } + + return nullptr; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom( + std::set &MBBs) { + MachineBasicBlock *CommonDom; + std::set::const_iterator It = MBBs.begin(); + std::set::const_iterator E = MBBs.end(); + for (CommonDom = *It; It != E && CommonDom; ++It) { + MachineBasicBlock *MBB = *It; + if (MBB != CommonDom) + CommonDom = findNearestCommonPostDom(MBB, CommonDom); + } + + DEBUG( + dbgs() << "Common post dominator for exit blocks is "; + if (CommonDom) + dbgs() << "BB" << CommonDom->getNumber() << "\n"; + else + dbgs() << "NULL\n"; + ); + + return CommonDom; +} + +char AMDGPUCFGStructurizer::ID = 0; + +} // end anonymous namespace + + +INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) + +FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { + return new AMDGPUCFGStructurizer(); +} diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h new file mode 100644 index 00000000000..4d3041ff3db --- /dev/null +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -0,0 +1,704 @@ +//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeT.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODET_H +#define AMDKERNELCODET_H + +#include +#include + +//---------------------------------------------------------------------------// +// AMD Kernel Code, and its dependencies // +//---------------------------------------------------------------------------// + +typedef uint8_t hsa_powertwo8_t; +typedef uint32_t hsa_ext_code_kind_t; +typedef uint8_t hsa_ext_brig_profile8_t; +typedef uint8_t hsa_ext_brig_machine_model8_t; +typedef uint64_t hsa_ext_control_directive_present64_t; +typedef uint16_t hsa_ext_exception_kind16_t; +typedef uint32_t hsa_ext_code_kind32_t; + +typedef struct hsa_dim3_s { + uint32_t x; + uint32_t y; + uint32_t z; +} hsa_dim3_t; + +/// The version of the amd_*_code_t struct. Minor versions must be +/// backward compatible. 
+typedef uint32_t amd_code_version32_t; +enum amd_code_version_t { + AMD_CODE_VERSION_MAJOR = 0, + AMD_CODE_VERSION_MINOR = 1 +}; + +/// The values used to define the number of bytes to use for the +/// swizzle element size. +enum amd_element_byte_size_t { + AMD_ELEMENT_2_BYTES = 0, + AMD_ELEMENT_4_BYTES = 1, + AMD_ELEMENT_8_BYTES = 2, + AMD_ELEMENT_16_BYTES = 3 +}; + +/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and +/// COMPUTE_PGM_RSRC2 registers. +typedef uint64_t amd_compute_pgm_resource_register64_t; + +/// Every amd_*_code_t has the following properties, which are composed of +/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), +/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount +/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. +/// +/// (Note that bit fields cannot be used as their layout is +/// implementation defined in the C standard and so cannot be used to +/// specify an ABI) +typedef uint32_t amd_code_property32_t; +enum amd_code_property_mask_t { + + /// Enable the setup of the SGPR user data registers + /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + /// for initial register state. + /// + /// The total number of SGPRuser data registers requested must not + /// exceed 16. Any requests beyond 16 will be ignored. + /// + /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + /// SGPR user data registers enabled up to 16). + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + 
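/// Editorial sketch, not part of this patch: the *_SHIFT, *_WIDTH and mask
/// values in this enum are intended to be used as in the helpers below. The
/// helper names are illustrative; only the enum and typedef names come from
/// this header.
static inline uint32_t amd_code_property_get(amd_code_property32_t props,
                                             uint32_t mask, uint32_t shift) {
  // Extract a bit field, e.g.
  //   amd_code_property_get(props,
  //                         AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR,
  //                         AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT)
  // yields 0 or 1 for the single-bit "enable" fields.
  return (props & mask) >> shift;
}

static inline unsigned amd_element_byte_size_in_bytes(unsigned encoded) {
  // amd_element_byte_size_t encodes 2, 4, 8 and 16 bytes as the values 0..3.
  return 2u << encoded;
}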
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + /// Control wave ID base counter for GDS ordered-append. Used to set + /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if + /// ORDERED_APPEND_MODE also needs to be settable) + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /// The interleave (swizzle) element size in bytes required by the + /// code for private memory. This must be 2, 4, 8 or 16. This value + /// is provided to the finalizer when it is invoked and is recorded + /// here. The hardware will interleave the memory requests of each + /// lane of a wavefront by this element size to ensure each + /// work-item gets a distinct memory memory location. Therefore, the + /// finalizer ensures that all load and store operations done to + /// private memory do not exceed this size. For example, if the + /// element size is 4 (32-bits or dword) and a 64-bit value must be + /// loaded, the finalizer will generate two 32-bit loads. This + /// ensures that the interleaving will get the the work-item + /// specific dword for both halves of the 64-bit value. If it just + /// did a 64-bit load then it would get one dword which belonged to + /// its own work-item, but the second dword would belong to the + /// adjacent lane work-item since the interleaving is in dwords. + /// + /// The value used must match the value that the runtime configures + /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + /// is generally DWORD. + /// + /// Use values from the amd_element_byte_size_t enum. + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /// Are global memory addresses 64 bits. Must match + /// amd_kernel_code_t.hsail_machine_model == + /// HSA_MACHINE_LARGE. Must also match + /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /// Indicate if the generated ISA is using a dynamically sized call + /// stack. 
This can happen if calls are implemented using a call + /// stack and recursion, alloca or calls to indirect functions are + /// present. In these cases the Finalizer cannot compute the total + /// private segment size at compile time. In this case the + /// workitem_private_segment_byte_size only specifies the statically + /// know private segment size, and additional space must be added + /// for the call stack. + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /// Indicate if code generated has support for debugging. + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT +}; + +/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// control directives. These control how the finalizer generates code. This +/// struct is used both as an argument to hsaFinalizeKernel to specify values for +/// the control directives, and is used in HsaKernelCode to record the values of +/// the control directives that the finalize used when generating the code which +/// either came from the finalizer argument or explicit HSAIL control +/// directives. See the definition of the control directives in HSA Programmer's +/// Reference Manual which also defines how the values specified as finalizer +/// arguments have to agree with the control directives in the HSAIL code. +typedef struct hsa_ext_control_directives_s { + /// This is a bit set indicating which control directives have been + /// specified. If the value is 0 then there are no control directives specified + /// and the rest of the fields can be ignored. The bits are accessed using the + /// hsa_ext_control_directives_present_mask_t. Any control directive that is not + /// enabled in this bit set must have the value of all 0s. + hsa_ext_control_directive_present64_t enabled_control_directives; + + /// If enableBreakExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. If the kernel being finalized + /// has any enablebreakexceptions control directives, then the values specified + /// by this argument are unioned with the values in these control + /// directives. If any of the functions the kernel calls have an + /// enablebreakexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_break_exceptions; + + /// If enableDetectExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. However, an implementation + /// should endeavour to make the performance impact small. If the kernel being + /// finalized has any enabledetectexceptions control directives, then the + /// values specified by this argument are unioned with the values in these + /// control directives. 
If any of the functions the kernel calls have an + /// enabledetectexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_detect_exceptions; + + /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of + /// dynamic group segment can be allocated for a dispatch, otherwise the value + /// specifies the maximum number of bytes of dynamic group segment that can be + /// allocated for a dispatch. If the kernel being finalized has any + /// maxdynamicsize control directives, then the values must be the same, and + /// must be the same as this argument if it is enabled. This value can be used + /// by the finalizer to determine the maximum number of bytes of group memory + /// used by each work-group by adding this value to the group memory required + /// for all group segment variables used by the kernel and all functions it + /// calls, and group memory used to implement other HSAIL features such as + /// fbarriers and the detect exception operations. This can allow the finalizer + /// to determine the expected number of work-groups that can be executed by a + /// compute unit and allow more resources to be allocated to the work-items if + /// it is known that fewer work-groups can be executed due to group memory + /// limitations. + uint32_t max_dynamic_group_size; + + /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater + /// than 0. See HSA Programmer's Reference Manual description of + /// maxflatgridsize control directive. + uint32_t max_flat_grid_size; + + /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be + /// greater than 0. See HSA Programmer's Reference Manual description of + /// maxflatworkgroupsize control directive. + uint32_t max_flat_workgroup_size; + + /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the + /// finalizer is free to generate ISA that may result in any number of + /// work-groups executing on a single compute unit. Otherwise, the finalizer + /// should attempt to generate ISA that will allow the specified number of + /// work-groups to execute on a single compute unit. This is only a hint and + /// can be ignored by the finalizer. If the kernel being finalized, or any of + /// the functions it calls, has a requested control directive, then the values + /// must be the same. This can be used to determine the number of resources + /// that should be allocated to a single work-group and work-item. For example, + /// a low value may allow more resources to be allocated, resulting in higher + /// per work-item performance, as it is known there will never be more than the + /// specified number of work-groups actually executing on the compute + /// unit. Conversely, a high value may allocate fewer resources, resulting in + /// lower per work-item performance, which is offset by the fact it allows more + /// work-groups to actually execute on the compute unit. + uint32_t requested_workgroups_per_cu; + + /// If not enabled then all elements for Dim3 must be 0, otherwise every + /// element must be greater than 0. See HSA Programmer's Reference Manual + /// description of requiredgridsize control directive. + hsa_dim3_t required_grid_size; + + /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be + /// 0, and the produced code can be dispatched with any legal work-group range + /// consistent with the dispatch dimensions. 
Otherwise, the code produced must + /// always be dispatched with the specified work-group range. No element of the + /// specified range must be 0. It must be consistent with required_dimensions + /// and max_flat_workgroup_size. If the kernel being finalized, or any of the + /// functions it calls, has a requiredworkgroupsize control directive, then the + /// values must be the same. Specifying a value can allow the finalizer to + /// optimize work-group id operations, and if the number of work-items in the + /// work-group is less than the WAVESIZE then barrier operations can be + /// optimized to just a memory fence. + hsa_dim3_t required_workgroup_size; + + /// If requiredDim is not enabled then must be 0 and the produced kernel code + /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is + /// 1..3 and the code produced must only be dispatched with a dimension that + /// matches. Other values are illegal. If the kernel being finalized, or any of + /// the functions it calls, has a requireddimsize control directive, then the + /// values must be the same. This can be used to optimize the code generated to + /// compute the absolute and flat work-group and work-item id, and the dim + /// HSAIL operations. + uint8_t required_dim; + + /// Reserved. Must be 0. + uint8_t reserved[75]; +} hsa_ext_control_directives_t; + +/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel +/// Code Object to set up the hardware to execute the kernel dispatch. +/// +/// Initial Kernel Register State. +/// +/// Initial kernel register state will be set up by CP/SPI prior to the start +/// of execution of every wavefront. This is limited by the constraints of the +/// current hardware. +/// +/// The order of the SGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enable_sgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at SGPR0: the first enabled register is SGPR0, the next +/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR +/// number. +/// +/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and +/// apply to all waves of the grid. It is possible to specify more than 16 User +/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 +/// are actually initialized. These are then immediately followed by the System +/// SGPRs that are set up by ADC/SPI and can have different values for each wave +/// of the grid dispatch. +/// +/// SGPR register initial state is defined as follows: +/// +/// Private Segment Buffer (enable_sgpr_private_segment_buffer): +/// Number of User SGPR registers: 4. V# that can be used, together with +/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg +/// segments using a segment address. It must be set as follows: +/// - Base address: of the scratch memory area used by the dispatch. It +/// does not include the scratch wave offset. It will be the per process +/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for +/// example there may be a per pipe offset, or per AQL Queue offset). +/// - Stride + data_format: Element Size * Index Stride (???) +/// - Cache swizzle: ??? +/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for +/// scratch) +/// - Num records: Flat Scratch Work Item Size / Element Size (???) +/// - Dst_sel_*: ??? +/// - Num_format: ??? 
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must +/// agree with amd_kernel_code_t.privateElementSize) +/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must +/// be number of wavefront lanes for scratch, must agree with +/// amd_kernel_code_t.wavefrontSize) +/// - Add tid enable: 1 +/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, +/// - Hash_enable: ??? +/// - Heap: ??? +/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE +/// - Type: 0 (a buffer) (???) +/// +/// Dispatch Ptr (enable_sgpr_dispatch_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet +/// for kernel actually executing. +/// +/// Queue Ptr (enable_sgpr_queue_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for +/// AQL queue on which the dispatch packet was queued. +/// +/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): +/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This +/// is directly copied from the kernargPtr in the dispatch packet. Having CP +/// load it once avoids loading it at the beginning of every wavefront. +/// +/// Dispatch Id (enable_sgpr_dispatch_id): +/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch +/// packet being executed. +/// +/// Flat Scratch Init (enable_sgpr_flat_scratch_init): +/// Number of User SGPR registers: 2. This is 2 SGPRs. +/// +/// For CI/VI: +/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE +/// to base of memory for scratch for this dispatch. This is the same offset +/// used in computing the Scratch Segment Buffer base address. The value of +/// Scratch Wave Offset must be added by the kernel code and moved to +/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. +/// +/// The second SGPR is 32 bit byte size of a single work-itemÂ’s scratch +/// memory usage. This is directly loaded from the dispatch packet Private +/// Segment Byte Size and rounded up to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in +/// flat memory instructions. Having CP load it once avoids loading it at +/// the beginning of every wavefront. +/// +/// For PI: +/// This is the 64 bit base address of the scratch backing memory for +/// allocated by CP for this dispatch. +/// +/// Private Segment Size (enable_sgpr_private_segment_size): +/// Number of User SGPR registers: 1. The 32 bit byte size of a single +/// work-itemÂ’s scratch memory allocation. This is the value from the dispatch +/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// Having CP load it once avoids loading it at the beginning of every +/// wavefront. +/// +/// \todo [This will not be used for CI/VI since it is the same value as +/// the second SGPR of Flat Scratch Init. However, it is need for PI which +/// changes meaning of Flat Scratchg Init..] +/// +/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the X dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). +/// +/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): +/// Number of User SGPR registers: 1. 
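/// Illustration only (not part of this patch): the grid work-group counts
/// described above and below are plain ceiling divisions over the dispatch
/// packet fields, i.e. ((gridSize.x + workgroupSize.x - 1) / workgroupSize.x).
static inline uint32_t amd_grid_workgroup_count(uint32_t grid_size,
                                                uint32_t workgroup_size) {
  return (grid_size + workgroup_size - 1) / workgroup_size;
}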
32 bit count of the number of +/// work-groups in the Y dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Z dimension for the grid being executed. Computed +/// from the fields in the HsaDispatchPacket as +/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Work-Group Id X (enable_sgpr_workgroup_id_x): +/// Number of System SGPR registers: 1. 32 bit work group id in X dimension +/// of grid for wavefront. Always present. +/// +/// Work-Group Id Y (enable_sgpr_workgroup_id_y): +/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension +/// of grid for wavefront. +/// +/// Work-Group Id Z (enable_sgpr_workgroup_id_z): +/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension +/// of grid for wavefront. If present then Work-group Id Y will also be +/// present +/// +/// Work-Group Info (enable_sgpr_workgroup_info): +/// Number of System SGPR registers: 1. {first_wave, 14Â’b0000, +/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} +/// +/// Private Segment Wave Byte Offset +/// (enable_sgpr_private_segment_wave_byte_offset): +/// Number of System SGPR registers: 1. 32 bit byte offset from base of +/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg +/// segment address when using Scratch Segment Buffer. It must be added to +/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. +/// +/// +/// The order of the VGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enableVgpr* bit fields. The register numbers used for enabled registers +/// are dense starting at VGPR0: the first enabled register is VGPR0, the next +/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR +/// number. +/// +/// VGPR register initial state is defined as follows: +/// +/// Work-Item Id X (always initialized): +/// Number of registers: 1. 32 bit work item id in X dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id X (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Y dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id X (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Z dimension of work-group +/// for wavefront lane. +/// +/// +/// The setting of registers is being done by existing GPU hardware as follows: +/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data +/// registers. +/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any +/// combination including none. +/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot +/// be added into the value Flat Scratch Offset which would avoid the +/// Finalizer generated prolog having to do the add. +/// 4) The VGPRs are set by SPI which only supports specifying either (X), +/// (X, Y) or (X, Y, Z). +/// +/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so +/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and +/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? 
Register. +/// +/// The global segment can be accessed either using flat operations or buffer +/// operations. If buffer operations are used then the Global Buffer used to +/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a +/// segment address is not passed into the kernel code by CP since its base +/// address is always 0. Instead the Finalizer generates prolog code to +/// initialize 4 SGPRs with a V# that has the following properties, and then +/// uses that in the buffer instructions: +/// - base address of 0 +/// - no swizzle +/// - ATC=1 +/// - MTYPE set to support memory coherence specified in +/// amd_kernel_code_t.globalMemoryCoherence +/// +/// When the Global Buffer is used to access the Kernarg segment, must add the +/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. +/// Alternatively scalar loads can be used if the kernarg offset is uniform, as +/// the kernarg segment is constant for the duration of the kernel execution. +/// +typedef struct amd_kernel_code_s { + /// The AMD major version of the Code Object. Must be the value + /// AMD_CODE_VERSION_MAJOR. + amd_code_version32_t amd_code_version_major; + + /// The AMD minor version of the Code Object. Minor versions must be + /// backward compatible. Must be the value + /// AMD_CODE_VERSION_MINOR. + amd_code_version32_t amd_code_version_minor; + + /// The byte size of this struct. Must be set to + /// sizeof(amd_kernel_code_t). Used for backward + /// compatibility. + uint32_t struct_byte_size; + + /// The target chip instruction set for which code has been + /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration + /// in sc/Interface/SCCommon.h. + uint32_t target_chip; + + /// Byte offset (possibly negative) from start of amd_kernel_code_t + /// object to kernel's entry point instruction. The actual code for + /// the kernel is required to be 256 byte aligned to match hardware + /// requirements (SQ cache line is 16). The code must be position + /// independent code (PIC) for AMD devices to give runtime the + /// option of copying code to discrete GPU memory or APU L2 + /// cache. The Finalizer should endeavour to allocate all kernel + /// machine code in contiguous memory pages so that a device + /// pre-fetcher will tend to only pre-fetch Kernel Code objects, + /// improving cache performance. + int64_t kernel_code_entry_byte_offset; + + /// Range of bytes to consider prefetching expressed as an offset + /// and size. The offset is from the start (possibly negative) of + /// amd_kernel_code_t object. Set both to 0 if no prefetch + /// information is available. + /// + /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did + /// not make the size a uint64_t as prefetching more than 4GiB seems + /// excessive. + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + + /// Number of bytes of scratch backing memory required for full + /// occupancy of target chip. This takes into account the number of + /// bytes of scratch per work-item, the wavefront size, the maximum + /// number of wavefronts per CU, and the number of CUs. This is an + /// upper limit on scratch. If the grid being dispatched is small it + /// may only need less than this. If the kernel uses no scratch, or + /// the Finalizer has not computed this value, it must be 0. + uint64_t max_scratch_backing_memory_byte_size; + + /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and + /// COMPUTE_PGM_RSRC2 registers. 
+  amd_compute_pgm_resource_register64_t compute_pgm_resource_registers;
+
+  /// Code properties. See amd_code_property_mask_t for a full list of
+  /// properties.
+  amd_code_property32_t code_properties;
+
+  /// The amount of memory required for the combined private, spill
+  /// and arg segments for a work-item in bytes. If
+  /// is_dynamic_callstack is 1 then additional space must be added to
+  /// this value for the call stack.
+  uint32_t workitem_private_segment_byte_size;
+
+  /// The amount of group segment memory required by a work-group in
+  /// bytes. This does not include any dynamically allocated group
+  /// segment memory that may be added when the kernel is
+  /// dispatched.
+  uint32_t workgroup_group_segment_byte_size;
+
+  /// Number of bytes of GDS required by the kernel dispatch. Must be 0 if
+  /// not using GDS.
+  uint32_t gds_segment_byte_size;
+
+  /// The size in bytes of the kernarg segment that holds the values
+  /// of the arguments to the kernel. This could be used by CP to
+  /// prefetch the kernarg segment pointed to by the dispatch packet.
+  uint64_t kernarg_segment_byte_size;
+
+  /// Number of fbarriers used in the kernel and all functions it
+  /// calls. If the implementation uses group memory to allocate the
+  /// fbarriers then that amount must already be included in the
+  /// workgroup_group_segment_byte_size total.
+  uint32_t workgroup_fbarrier_count;
+
+  /// Number of scalar registers used by a wavefront. This includes
+  /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+  /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
+  /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+  uint16_t wavefront_sgpr_count;
+
+  /// Number of vector registers used by each work-item. Used to set
+  /// COMPUTE_PGM_RSRC1.VGPRS.
+  uint16_t workitem_vgpr_count;
+
+  /// If reserved_vgpr_count is 0 then this must be 0. Otherwise, this is the
+  /// first fixed VGPR number reserved.
+  uint16_t reserved_vgpr_first;
+
+  /// The number of consecutive VGPRs reserved by the client. If
+  /// is_debug_supported then this count includes VGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_vgpr_count;
+
+  /// If reserved_sgpr_count is 0 then this must be 0. Otherwise, this is the
+  /// first fixed SGPR number reserved.
+  uint16_t reserved_sgpr_first;
+
+  /// The number of consecutive SGPRs reserved by the client. If
+  /// is_debug_supported then this count includes SGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_sgpr_count;
+
+  /// If is_debug_supported is 0 then this must be 0. Otherwise, this is the
+  /// fixed SGPR number used to hold the wave scratch offset for the
+  /// entire kernel execution, or uint16_t(-1) if the register is not
+  /// used or not known.
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+  /// If is_debug_supported is 0 then this must be 0. Otherwise, this is the
+  /// fixed SGPR number of the first of 4 SGPRs used to hold the
+  /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+  /// if the registers are not used or not known.
+  uint16_t debug_private_segment_buffer_sgpr;
+
+  /// The maximum byte alignment of variables used by the kernel in
+  /// the specified memory segment. Expressed as a power of two. Must
+  /// be at least HSA_POWERTWO_16.
+  hsa_powertwo8_t kernarg_segment_alignment;
+  hsa_powertwo8_t group_segment_alignment;
+  hsa_powertwo8_t private_segment_alignment;
+
+  uint8_t reserved3;
+
+  /// Type of code object.
+  hsa_ext_code_kind32_t code_type;
+
+  /// Reserved for code properties if any are defined in the future.
+  /// There are currently no code properties so this field must be 0.
+  uint32_t reserved4;
+
+  /// Wavefront size expressed as a power of two. Must be a power of 2
+  /// in range 1..64 inclusive. Used to support a runtime query that
+  /// obtains the wavefront size, which may be used by the application to
+  /// allocate dynamic group memory and set the dispatch work-group
+  /// size.
+  hsa_powertwo8_t wavefront_size;
+
+  /// The optimization level specified when the kernel was
+  /// finalized.
+  uint8_t optimization_level;
+
+  /// The HSAIL profile defines which features are used. This
+  /// information is from the HSAIL version directive. If this
+  /// amd_kernel_code_t is not generated from an HSAIL compilation
+  /// unit then this must be 0.
+  hsa_ext_brig_profile8_t hsail_profile;
+
+  /// The HSAIL machine model gives the address sizes used by the
+  /// code. This information is from the HSAIL version directive. If
+  /// not generated from an HSAIL compilation unit then it must still
+  /// indicate for what machine model the code is generated.
+  hsa_ext_brig_machine_model8_t hsail_machine_model;
+
+  /// The HSAIL major version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then this must be 0.
+  uint32_t hsail_version_major;
+
+  /// The HSAIL minor version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then this must be 0.
+  uint32_t hsail_version_minor;
+
+  /// Reserved for HSAIL target options if any are defined in the
+  /// future. There are currently no target options so this field
+  /// must be 0.
+  uint16_t reserved5;
+
+  /// Reserved. Must be 0.
+  uint16_t reserved6;
+
+  /// The values should be the actual values used by the finalizer
+  /// in generating the code. This may be the union of values
+  /// specified as finalizer arguments and explicit HSAIL control
+  /// directives. If the finalizer chooses to ignore a control
+  /// directive, and not generate constrained code, then the control
+  /// directive should not be marked as enabled even though it was
+  /// present in the HSAIL or finalizer argument. The values are
+  /// intended to reflect the constraints that the code actually
+  /// requires to correctly execute, not the values that were
+  /// actually specified at finalize time.
+  hsa_ext_control_directives_t control_directive;
+
+  /// The code can immediately follow the amd_kernel_code_t, or can
+  /// come after subsequent amd_kernel_code_t structs when there are
+  /// multiple kernels in the compilation unit.
+
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 00000000000..0c9a68804a3
--- /dev/null
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -0,0 +1,1380 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +struct OptionalOperand; + +class AMDGPUOperand : public MCParsedAsmOperand { + enum KindTy { + Token, + Immediate, + Register, + Expression + } Kind; + + SMLoc StartLoc, EndLoc; + +public: + AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + MCContext *Ctx; + + enum ImmTy { + ImmTyNone, + ImmTyDSOffset0, + ImmTyDSOffset1, + ImmTyGDS, + ImmTyOffset, + ImmTyGLC, + ImmTySLC, + ImmTyTFE, + ImmTyClamp, + ImmTyOMod + }; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct ImmOp { + bool IsFPImm; + ImmTy Type; + int64_t Val; + }; + + struct RegOp { + unsigned RegNo; + int Modifiers; + const MCRegisterInfo *TRI; + bool IsForcedVOP3; + }; + + union { + TokOp Tok; + ImmOp Imm; + RegOp Reg; + const MCExpr *Expr; + }; + + void addImmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm(getImm())); + } + + StringRef getToken() const { + return StringRef(Tok.Data, Tok.Length); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isReg()) + addRegOperands(Inst, N); + else + addImmOperands(Inst, N); + } + + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm( + Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); + addRegOperands(Inst, N); + } + + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + } + + bool defaultTokenHasSuffix() const { + StringRef Token(Tok.Data, Tok.Length); + + return Token.endswith("_e32") || Token.endswith("_e64"); + } + + bool isToken() const override { + return Kind == Token; + } + + bool isImm() const override { + return Kind == Immediate; + } + + bool isInlineImm() const { + float F = BitsToFloat(Imm.Val); + // TODO: Add 0.5pi for VI + return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + } + + bool isDSOffset0() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset0; + } + + bool isDSOffset1() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset1; + } + + int64_t getImm() const { + return Imm.Val; + } + + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; + } + + bool isRegKind() const { + return Kind == Register; + } + + bool isReg() const override { + return Kind == Register && Reg.Modifiers == -1; + } + + bool isRegWithInputMods() const { + return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + } + + void setModifiers(unsigned Mods) { + assert(isReg()); + Reg.Modifiers = Mods; + } + + bool hasModifiers() const { + assert(isRegKind()); + return Reg.Modifiers != -1; + } + + unsigned getReg() const override { + return Reg.RegNo; + } + + bool isRegOrImm() const { + return isReg() || isImm(); + } + + bool isRegClass(unsigned RCID) const { + return Reg.TRI->getRegClass(RCID).contains(getReg()); + } + + bool isSCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc64() const { + return isImm() || isInlineImm() || + (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + } + + bool isVCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVCSrc64() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isVSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVSrc64() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isMem() const override { + return false; + } + + bool isExpr() const { + return Kind == Expression; + } + + bool isSoppBrTarget() const { + return isExpr() || isImm(); + } + + SMLoc getStartLoc() const override { + return StartLoc; + } + + SMLoc getEndLoc() const override { + return EndLoc; + } + + void print(raw_ostream &OS) const override { } + + static std::unique_ptr CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { + auto Op = llvm::make_unique(Immediate); + Op->Imm.Val = Val; + Op->Imm.IsFPImm = IsFPImm; + Op->Imm.Type = Type; + Op->StartLoc = Loc; + Op->EndLoc = Loc; + return Op; + } + + static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { + auto Res = llvm::make_unique(Token); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + Res->StartLoc = Loc; + Res->EndLoc = Loc; + return Res; + } + + static std::unique_ptr CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + 
bool ForceVOP3) { + auto Op = llvm::make_unique(Register); + Op->Reg.RegNo = RegNo; + Op->Reg.TRI = TRI; + Op->Reg.Modifiers = -1; + Op->Reg.IsForcedVOP3 = ForceVOP3; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { + auto Op = llvm::make_unique(Expression); + Op->Expr = Expr; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + bool isDSOffset() const; + bool isDSOffset01() const; + bool isSWaitCnt() const; + bool isMubufOffset() const; +}; + +class AMDGPUAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + const MCInstrInfo &MII; + MCAsmParser &Parser; + + unsigned ForcedEncodingSize; + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "AMDGPUGenAsmMatcher.inc" + + /// } + +public: + AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0){ + + if (STI.getFeatureBits().none()) { + // Set default features. + STI.ToggleFeature("SOUTHERN_ISLANDS"); + } + + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + unsigned getForcedEncodingSize() const { + return ForcedEncodingSize; + } + + void setForcedEncodingSize(unsigned Size) { + ForcedEncodingSize = Size; + } + + bool isForcedVOP3() const { + return ForcedEncodingSize == 64; + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + bool ParseDirective(AsmToken DirectiveID) override; + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default = 0); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, + OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseOptionalOps( + const ArrayRef &OptionalOps, + OperandVector &Operands); + + + void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); + + bool parseCnt(int64_t &IntVal); + OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + + OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); + void cvtFlat(MCInst &Inst, const OperandVector &Operands); + + void cvtMubuf(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseOffset(OperandVector &Operands); + OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseGLC(OperandVector &Operands); + OperandMatchResultTy parseSLC(OperandVector 
&Operands); + OperandMatchResultTy parseTFE(OperandVector &Operands); + + OperandMatchResultTy parseDMask(OperandVector &Operands); + OperandMatchResultTy parseUNorm(OperandVector &Operands); + OperandMatchResultTy parseR128(OperandVector &Operands); + + void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); +}; + +struct OptionalOperand { + const char *Name; + AMDGPUOperand::ImmTy Type; + bool IsBit; + int64_t Default; + bool (*ConvertResult)(int64_t&); +}; + +} + +static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { + if (IsVgpr) { + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::VGPR_32RegClassID; + case 2: return AMDGPU::VReg_64RegClassID; + case 3: return AMDGPU::VReg_96RegClassID; + case 4: return AMDGPU::VReg_128RegClassID; + case 8: return AMDGPU::VReg_256RegClassID; + case 16: return AMDGPU::VReg_512RegClassID; + } + } + + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } +} + +static unsigned getRegForName(const StringRef &RegName) { + + return StringSwitch(RegName) + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("m0", AMDGPU::M0) + .Case("scc", AMDGPU::SCC) + .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Default(0); +} + +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + const AsmToken Tok = Parser.getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + const StringRef &RegName = Tok.getString(); + RegNo = getRegForName(RegName); + + if (RegNo) { + Parser.Lex(); + return false; + } + + // Match vgprs and sgprs + if (RegName[0] != 's' && RegName[0] != 'v') + return true; + + bool IsVgpr = RegName[0] == 'v'; + unsigned RegWidth; + unsigned RegIndexInClass; + if (RegName.size() > 1) { + // We have a 32-bit register + RegWidth = 1; + if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + return true; + Parser.Lex(); + } else { + // We have a register greater than 32-bits. + + int64_t RegLo, RegHi; + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegLo)) + return true; + + if (getLexer().isNot(AsmToken::Colon)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegHi)) + return true; + + if (getLexer().isNot(AsmToken::RBrac)) + return true; + + Parser.Lex(); + RegWidth = (RegHi - RegLo) + 1; + if (IsVgpr) { + // VGPR registers aren't aligned. + RegIndexInClass = RegLo; + } else { + // SGPR registers are aligned. Max alignment is 4 dwords. 
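+      // Worked example (illustrative, not from the original source): parsing
+      // "s[4:7]" gives RegLo = 4, RegHi = 7, so RegWidth = 4 and
+      // RegIndexInClass = 4 / std::min(4u, 4u) = 1, i.e. the second aligned
+      // 4-dword tuple in the SReg_128 class selected below.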
+ RegIndexInClass = RegLo / std::min(RegWidth, 4u); + } + } + + const MCRegisterInfo *TRC = getContext().getRegisterInfo(); + unsigned RC = getRegClass(IsVgpr, RegWidth); + if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + return true; + RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); + return false; +} + +unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + + if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || + (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) + return Match_InvalidOperand; + + return Match_Success; +} + + +bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + + switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { + default: break; + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction not supported on this GPU"); + + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) { + if (isForcedVOP3()) { + // If 64-bit encoding has been forced we can end up with no + // clamp or omod operands if none of the registers have modifiers, + // so we need to add these to the operand list. + AMDGPUOperand &LastOp = + ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + if (LastOp.isRegKind() || + (LastOp.isImm() && + LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { + SMLoc S = Parser.getTok().getLoc(); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyClamp)); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOMod)); + bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, + Out, ErrorInfo, + MatchingInlineAsm); + if (!Res) + return Res; + } + + } + return Error(IDLoc, "too few operands for instruction"); + } + + ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + } + llvm_unreachable("Implement any new match types added!"); +} + +bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { + return true; +} + +static bool operandsHaveModifiers(const OperandVector &Operands) { + + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); + if (Op.isRegKind() && Op.hasModifiers()) + return true; + if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || + Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) + return true; + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { + + // Try to parse with a custom parser + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + + // If we successfully parsed the operand or if there as an error parsing, + // we are done. + // + // If we are parsing after we reach EndOfStatement then this means we + // are appending default values to the Operands list. This is only done + // by custom parser, so we shouldn't continue on to the generic parsing. 
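+  // (For example, once "ds_read_b32 v0, v1" reaches EndOfStatement, the
+  // custom DS parser can still append default "offset" and "gds" immediates
+  // so the matcher sees a complete operand list.)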
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || + getLexer().is(AsmToken::EndOfStatement)) + return ResTy; + + bool Negate = false, Abs = false; + if (getLexer().getKind()== AsmToken::Minus) { + Parser.Lex(); + Negate = true; + } + + if (getLexer().getKind() == AsmToken::Pipe) { + Parser.Lex(); + Abs = true; + } + + switch(getLexer().getKind()) { + case AsmToken::Integer: { + SMLoc S = Parser.getTok().getLoc(); + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + APInt IntVal32(32, IntVal); + if (IntVal32.getSExtValue() != IntVal) { + Error(S, "invalid immediate: only 32-bit values are legal"); + return MatchOperand_ParseFail; + } + + IntVal = IntVal32.getSExtValue(); + if (Negate) + IntVal *= -1; + Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); + return MatchOperand_Success; + } + case AsmToken::Real: { + // FIXME: We should emit an error if a double precisions floating-point + // value is used. I'm not sure the best way to detect this. + SMLoc S = Parser.getTok().getLoc(); + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + + APFloat F((float)BitsToDouble(IntVal)); + if (Negate) + F.changeSign(); + Operands.push_back( + AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); + return MatchOperand_Success; + } + case AsmToken::Identifier: { + SMLoc S, E; + unsigned RegNo; + if (!ParseRegister(RegNo, S, E)) { + + bool HasModifiers = operandsHaveModifiers(Operands); + unsigned Modifiers = 0; + + if (Negate) + Modifiers |= 0x1; + + if (Abs) { + if (getLexer().getKind() != AsmToken::Pipe) + return MatchOperand_ParseFail; + Parser.Lex(); + Modifiers |= 0x2; + } + + if (Modifiers && !HasModifiers) { + // We are adding a modifier to src1 or src2 and previous sources + // don't have modifiers, so we need to go back and empty modifers + // for each previous source. + for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; + --PrevRegIdx) { + + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); + RegOp.setModifiers(0); + } + } + + + Operands.push_back(AMDGPUOperand::CreateReg( + RegNo, S, E, getContext().getRegisterInfo(), + isForcedVOP3())); + + if (HasModifiers || Modifiers) { + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); + RegOp.setModifiers(Modifiers); + + } + } else { + Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), + S)); + Parser.Lex(); + } + return MatchOperand_Success; + } + default: + return MatchOperand_NoMatch; + } +} + +bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + + // Clear any forced encodings from the previous instruction. + setForcedEncodingSize(0); + + if (Name.endswith("_e64")) + setForcedEncodingSize(64); + else if (Name.endswith("_e32")) + setForcedEncodingSize(32); + + // Add the instruction mnemonic + Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); + + while (!getLexer().is(AsmToken::EndOfStatement)) { + AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); + + // Eat the comma or space if there is one. 
+ if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + switch (Res) { + case MatchOperand_Success: break; + case MatchOperand_ParseFail: return Error(getLexer().getLoc(), + "failed parsing operand."); + case MatchOperand_NoMatch: return Error(getLexer().getLoc(), + "not a valid operand."); + } + } + + // Once we reach end of statement, continue parsing so we can add default + // values for optional arguments. + AMDGPUAsmParser::OperandMatchResultTy Res; + while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Error(getLexer().getLoc(), "failed parsing operand."); + } + return false; +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default) { + + // We are at the end of the statement, and this is a default argument, so + // use a default value. + if (getLexer().is(AsmToken::EndOfStatement)) { + Int = Default; + return MatchOperand_Success; + } + + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Identifier: { + StringRef OffsetName = Parser.getTok().getString(); + if (!OffsetName.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + if (getParser().parseAbsoluteExpression(Int)) + return MatchOperand_ParseFail; + break; + } + } + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + + SMLoc S = Parser.getTok().getLoc(); + int64_t Offset = 0; + + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + if (Res != MatchOperand_Success) + return Res; + + Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + int64_t Bit = 0; + SMLoc S = Parser.getTok().getLoc(); + + // We are at the end of the statement, and this is a default argument, so + // use a default value. 
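+  // Illustrative behaviour for Name == "glc": the token "glc" yields 1,
+  // "noglc" yields 0, and any other token is reported as no match.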
+ if (getLexer().isNot(AsmToken::EndOfStatement)) { + switch(getLexer().getKind()) { + case AsmToken::Identifier: { + StringRef Tok = Parser.getTok().getString(); + if (Tok == Name) { + Bit = 1; + Parser.Lex(); + } else if (Tok.startswith("no") && Tok.endswith(Name)) { + Bit = 0; + Parser.Lex(); + } else { + return MatchOperand_NoMatch; + } + break; + } + default: + return MatchOperand_NoMatch; + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); + return MatchOperand_Success; +} + +static bool operandsHasOptionalOp(const OperandVector &Operands, + const OptionalOperand &OOp) { + for (unsigned i = 0; i < Operands.size(); i++) { + const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); + if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || + (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) + return true; + + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOptionalOps(const ArrayRef &OptionalOps, + OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + for (const OptionalOperand &Op : OptionalOps) { + if (operandsHasOptionalOp(Operands, Op)) + continue; + AMDGPUAsmParser::OperandMatchResultTy Res; + int64_t Value; + if (Op.IsBit) { + Res = parseNamedBit(Op.Name, Operands, Op.Type); + if (Res == MatchOperand_NoMatch) + continue; + return Res; + } + + Res = parseIntWithPrefix(Op.Name, Value, Op.Default); + + if (Res == MatchOperand_NoMatch) + continue; + + if (Res != MatchOperand_Success) + return Res; + + if (Op.ConvertResult && !Op.ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; +} + +//===----------------------------------------------------------------------===// +// ds +//===----------------------------------------------------------------------===// + +static const OptionalOperand DSOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +static const OptionalOperand DSOptionalOpsOff01 [] = { + {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, + {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOps, Operands); +} +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOpsOff01, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + AMDGPUAsmParser::OperandMatchResultTy Res = + parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); + if (Res == MatchOperand_NoMatch) { + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOffset)); + Res = MatchOperand_Success; + } + return Res; +} + +bool AMDGPUOperand::isDSOffset() const { + return isImm() && isUInt<16>(getImm()); +} + +bool AMDGPUOperand::isDSOffset01() const { + return isImm() && isUInt<8>(getImm()); +} + +void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, + const OperandVector &Operands) { + + std::map OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if 
(Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; + unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + + ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 + ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + +void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { + + std::map OptionalIdx; + bool GDSOnly = false; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + if (Op.isToken() && Op.getToken() == "gds") { + GDSOnly = true; + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + + if (!GDSOnly) { + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + } + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + +bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { + StringRef CntName = Parser.getTok().getString(); + int64_t CntVal; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return true; + + if (getParser().parseAbsoluteExpression(CntVal)) + return true; + + if (getLexer().isNot(AsmToken::RParen)) + return true; + + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + int CntShift; + int CntMask; + + if (CntName == "vmcnt") { + CntMask = 0xf; + CntShift = 0; + } else if (CntName == "expcnt") { + CntMask = 0x7; + CntShift = 4; + } else if (CntName == "lgkmcnt") { + CntMask = 0x7; + CntShift = 8; + } else { + return true; + } + + IntVal &= ~(CntMask << CntShift); + IntVal |= (CntVal << CntShift); + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { + // Disable all counters by default. + // vmcnt [3:0] + // expcnt [6:4] + // lgkmcnt [10:8] + int64_t CntVal = 0x77f; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: + // The operand can be an integer value. 
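+    // For example (illustrative): "s_waitcnt 0" parses to CntVal = 0 (wait
+    // for vmcnt, expcnt and lgkmcnt to all drain), while the named form
+    // "s_waitcnt vmcnt(1)" is handled below and yields 0x771.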
+ if (getParser().parseAbsoluteExpression(CntVal)) + return MatchOperand_ParseFail; + break; + + case AsmToken::Identifier: + do { + if (parseCnt(CntVal)) + return MatchOperand_ParseFail; + } while(getLexer().isNot(AsmToken::EndOfStatement)); + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} + +//===----------------------------------------------------------------------===// +// sopp branch targets +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + switch (getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: { + int64_t Imm; + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + return MatchOperand_Success; + } + + case AsmToken::Identifier: + Operands.push_back(AMDGPUOperand::CreateExpr( + MCSymbolRefExpr::create(getContext().getOrCreateSymbol( + Parser.getTok().getString()), getContext()), S)); + Parser.Lex(); + return MatchOperand_Success; + } +} + +//===----------------------------------------------------------------------===// +// flat +//===----------------------------------------------------------------------===// + +static const OptionalOperand FlatOptionalOps [] = { + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +static const OptionalOperand FlatAtomicOptionalOps [] = { + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatAtomicOptionalOps, Operands); +} + +void AMDGPUAsmParser::cvtFlat(MCInst &Inst, + const OperandVector &Operands) { + std::map OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle 'glc' token which is sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) + continue; + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + + } + + // flat atomic instructions don't have a glc argument. 
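+  // (Illustrative note: for non-atomic flat ops the optional "glc" parsed
+  // above is recorded in OptionalIdx; for atomics it is either absent or a
+  // hard-coded token skipped in the loop, so the count() check below covers
+  // both cases.)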
+ if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + } + + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mubuf +//===----------------------------------------------------------------------===// + +static const OptionalOperand MubufOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { + return parseOptionalOps(MubufOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOffset(OperandVector &Operands) { + return parseIntWithPrefix("offset", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseGLC(OperandVector &Operands) { + return parseNamedBit("glc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSLC(OperandVector &Operands) { + return parseNamedBit("slc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseTFE(OperandVector &Operands) { + return parseNamedBit("tfe", Operands); +} + +bool AMDGPUOperand::isMubufOffset() const { + return isImm() && isUInt<12>(getImm()); +} + +void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, + const OperandVector &Operands) { + std::map OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
+ if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + assert(OptionalIdx.size() == 4); + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mimg +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDMask(OperandVector &Operands) { + return parseIntWithPrefix("dmask", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { + return parseNamedBit("unorm", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseR128(OperandVector &Operands) { + return parseNamedBit("r128", Operands); +} + +//===----------------------------------------------------------------------===// +// vop3 +//===----------------------------------------------------------------------===// + +static bool ConvertOmodMul(int64_t &Mul) { + if (Mul != 1 && Mul != 2 && Mul != 4) + return false; + + Mul >>= 1; + return true; +} + +static bool ConvertOmodDiv(int64_t &Div) { + if (Div == 1) { + Div = 0; + return true; + } + + if (Div == 2) { + Div = 3; + return true; + } + + return false; +} + +static const OptionalOperand VOP3OptionalOps [] = { + {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, + {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, + {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, +}; + +static bool isVOP3(OperandVector &Operands) { + if (operandsHaveModifiers(Operands)) + return true; + + AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); + + if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) + return true; + + if (Operands.size() >= 5) + return true; + + if (Operands.size() > 3) { + AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); + if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || + Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) + return true; + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { + + // The value returned by this function may change after parsing + // an operand so store the original value here. 
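+  // (In particular, parseOptionalOps below may append a clamp or omod
+  // immediate, which would make operandsHaveModifiers() return true.)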
+ bool HasModifiers = operandsHaveModifiers(Operands); + + bool IsVOP3 = isVOP3(Operands); + if (HasModifiers || IsVOP3 || + getLexer().isNot(AsmToken::EndOfStatement) || + getForcedEncodingSize() == 64) { + + AMDGPUAsmParser::OperandMatchResultTy Res = + parseOptionalOps(VOP3OptionalOps, Operands); + + if (!HasModifiers && Res == MatchOperand_Success) { + // We have added a modifier operation, so we need to make sure all + // previous register operands have modifiers + for (unsigned i = 2, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); + if (Op.isReg()) + Op.setModifiers(0); + } + } + return Res; + } + return MatchOperand_NoMatch; +} + +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + unsigned i = 2; + + std::map OptionalIdx; + + if (operandsHaveModifiers(Operands)) { + for (unsigned e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + if (Op.isRegWithInputMods()) { + ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); + continue; + } + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; + unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; + + ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); + } else { + for (unsigned e = Operands.size(); i != e; ++i) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); + } +} + +/// Force static initialization. +extern "C" void LLVMInitializeAMDGPUAsmParser() { + RegisterMCAsmParser A(TheAMDGPUTarget); + RegisterMCAsmParser B(TheGCNTarget); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "AMDGPUGenAsmMatcher.inc" + diff --git a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt new file mode 100644 index 00000000000..21ddc4eb83d --- /dev/null +++ b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAMDGPUAsmParser + AMDGPUAsmParser.cpp + ) diff --git a/lib/Target/AMDGPU/AsmParser/LLVMBuild.txt b/lib/Target/AMDGPU/AsmParser/LLVMBuild.txt new file mode 100644 index 00000000000..63d44d1e06f --- /dev/null +++ b/lib/Target/AMDGPU/AsmParser/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUAsmParser +parent = AMDGPU +required_libraries = MC MCParser AMDGPUDesc AMDGPUInfo Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/AsmParser/Makefile b/lib/Target/AMDGPU/AsmParser/Makefile new file mode 100644 index 00000000000..e6689b54b6b --- /dev/null +++ b/lib/Target/AMDGPU/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600AsmParser + +# Hack: we need to include 'main' R600 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td new file mode 100644 index 00000000000..2f5fdbe9207 --- /dev/null +++ b/lib/Target/AMDGPU/CIInstructions.td @@ -0,0 +1,149 @@ +//===-- CIInstructions.td - CI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for CI and newer. +//===----------------------------------------------------------------------===// + + +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isCIVI in { + +defm V_TRUNC_F64 : VOP1Inst , "v_trunc_f64", + VOP_F64_F64, ftrunc +>; +defm V_CEIL_F64 : VOP1Inst , "v_ceil_f64", + VOP_F64_F64, fceil +>; +defm V_FLOOR_F64 : VOP1Inst , "v_floor_f64", + VOP_F64_F64, ffloor +>; +defm V_RNDNE_F64 : VOP1Inst , "v_rndne_f64", + VOP_F64_F64, frint +>; +defm V_LOG_LEGACY_F32 : VOP1Inst , "v_log_legacy_f32", + VOP_F32_F32 +>; +defm V_EXP_LEGACY_F32 : VOP1Inst , "v_exp_legacy_f32", + VOP_F32_F32 +>; + +//===----------------------------------------------------------------------===// +// Flat Instructions +//===----------------------------------------------------------------------===// + +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; +def 
FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; +def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; +def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + 0x1d, "flat_store_dwordx2", VReg_64 +>; +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + 0x1e, "flat_store_dwordx4", VReg_128 +>; +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x1f, "flat_store_dwordx3", VReg_96 +>; +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < + 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; +defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; +defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < + 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; +defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; + +} // End SubtargetPredicate = isCIVI + +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [HasFlatAddressSpace] in { + +class FLATLoad_Pattern : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr, 0, 0, 0) +>; + +def : 
FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; + +class FLATStore_Pattern : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr, 0, 0, 0) + >; + +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; + +} // End HasFlatAddressSpace predicate + diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt new file mode 100644 index 00000000000..3e5ff1f3c6d --- /dev/null +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -0,0 +1,64 @@ +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) + +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) +add_public_tablegen_target(AMDGPUCommonTableGen) + +add_llvm_target(AMDGPUCodeGen + AMDILCFGStructurizer.cpp + AMDGPUAlwaysInlinePass.cpp + AMDGPUAsmPrinter.cpp + AMDGPUFrameLowering.cpp + AMDGPUIntrinsicInfo.cpp + AMDGPUISelDAGToDAG.cpp + AMDGPUMCInstLower.cpp + AMDGPUMachineFunction.cpp + AMDGPUSubtarget.cpp + AMDGPUTargetMachine.cpp + AMDGPUTargetTransformInfo.cpp + AMDGPUISelLowering.cpp + AMDGPUInstrInfo.cpp + AMDGPUPromoteAlloca.cpp + AMDGPURegisterInfo.cpp + R600ClauseMergePass.cpp + R600ControlFlowFinalizer.cpp + R600EmitClauseMarkers.cpp + R600ExpandSpecialInstrs.cpp + R600InstrInfo.cpp + R600ISelLowering.cpp + R600MachineFunctionInfo.cpp + R600MachineScheduler.cpp + R600OptimizeVectorRegisters.cpp + R600Packetizer.cpp + R600RegisterInfo.cpp + R600TextureIntrinsicsReplacer.cpp + SIAnnotateControlFlow.cpp + SIFixControlFlowLiveIntervals.cpp + SIFixSGPRCopies.cpp + SIFixSGPRLiveRanges.cpp + SIFoldOperands.cpp + SIInsertWaits.cpp + SIInstrInfo.cpp + SIISelLowering.cpp + SILoadStoreOptimizer.cpp + SILowerControlFlow.cpp + SILowerI1Copies.cpp + SIMachineFunctionInfo.cpp + SIPrepareScratchRegs.cpp + SIRegisterInfo.cpp + SIShrinkInstructions.cpp + SITypeRewriter.cpp + ) + +add_subdirectory(AsmParser) +add_subdirectory(InstPrinter) +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td new file mode 100644 index 00000000000..ba4df82a6d3 --- /dev/null +++ b/lib/Target/AMDGPU/CaymanInstructions.td @@ -0,0 +1,226 @@ +//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available only on Cayman +// family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isCayman : Predicate<"Subtarget->hasCaymanISA()">; + +//===----------------------------------------------------------------------===// +// Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isCayman] in { + +def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU +>; +def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU +>; + +def : IMad24Pat; + +let isVector = 1 in { + +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; + +def MULLO_INT_cm : MULLO_INT_Common<0x8F>; +def MULHI_INT_cm : MULHI_INT_Common<0x90>; +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; +def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; +def SIN_cm : SIN_Common<0x8D>; +def COS_cm : COS_Common<0x8E>; +} // End isVector = 1 + +def : RsqPat; + +def : POW_Common ; + +defm DIV_cm : DIV_Common; +defm : Expand24UBitOps; + +// RECIP_UINT emulation for Cayman +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), + (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) +>; + + def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { + let ADDR = 0; + let POP_COUNT = 0; + let COUNT = 0; + } + + +def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; + +class RAT_STORE_DWORD mask> : + CF_MEM_RAT_CACHELESS <0x14, 0, mask, + (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), + "STORE_DWORD $rw_gpr, $index_gpr", + [(global_store vt:$rw_gpr, i32:$index_gpr)]> { + let eop = 0; // This bit is not used on Cayman. +} + +def RAT_STORE_DWORD32 : RAT_STORE_DWORD ; +def RAT_STORE_DWORD64 : RAT_STORE_DWORD ; +def RAT_STORE_DWORD128 : RAT_STORE_DWORD ; + +class VTX_READ_cm buffer_id, dag outs, list pattern> + : VTX_WORD0_cm, VTX_READ { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. 
+ let SRC_SEL_X = 0; + let SRC_SEL_Y = 0; + let STRUCTURED_READ = 0; + let LDS_REQ = 0; + let COALESCED_READ = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 + // %T2_X = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// +def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End isCayman + diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td new file mode 100644 index 00000000000..7adcd46fe19 --- /dev/null +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -0,0 +1,670 @@ +//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to Evergreen and newer VLIW4/VLIW5 GPUs +// - Available only on Evergreen family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isEG : Predicate< + "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " + "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "!Subtarget->hasCaymanISA()" +>; + +def isEGorCayman : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" + "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" +>; + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman store instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +class CF_MEM_RAT_CACHELESS rat_inst, bits<4> rat_id, bits<4> mask, dag ins, + string name, list pattern> + : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, + "MEM_RAT_CACHELESS "#name, pattern>; + +class CF_MEM_RAT rat_inst, bits<4> rat_id, dag ins, string name, + list pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, + "MEM_RAT "#name, pattern>; + +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + "MSKOR $rw_gpr.XW, $index_gpr", + [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] +> { + let eop = 0; +} + +} // End let Predicates = [isEGorCayman] + +//===----------------------------------------------------------------------===// +// Evergreen Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEG] in { + +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; +defm DIV_eg : DIV_Common; + +def MULLO_INT_eg : MULLO_INT_Common<0x8F>; +def MULHI_INT_eg : MULHI_INT_Common<0x90>; +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +def : RsqPat; +def SIN_eg : SIN_Common<0x8D>; +def COS_eg : COS_Common<0x8E>; + +def : POW_Common ; +def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; + +defm : Expand24IBitOps; + +//===----------------------------------------------------------------------===// +// Memory read/write instructions +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1 in { + +// 32-bit store +def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr, $index_gpr, $eop", + [(global_store i32:$rw_gpr, i32:$index_gpr)] +>; + +// 64-bit store +def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, + (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", + [(global_store v2i32:$rw_gpr, i32:$index_gpr)] +>; + +//128-bit store +def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", + [(global_store v4i32:$rw_gpr, i32:$index_gpr)] +>; + +} // End usesCustomInserter = 1 + +class VTX_READ_eg buffer_id, dag outs, list pattern> + : VTX_WORD0_eg, VTX_READ { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let 
SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. + let SRC_SEL_X = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 1; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let MEGA_FETCH_COUNT = 2; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 4; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 + // %T2_X = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 8; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// + +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End Predicates = [isEG] + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +// Should be predicated on FeatureFP64 +// def FMA_64 : R600_3OP < +// 0xA, "FMA_64", +// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +// >; + +// BFE_UINT - bit_extract, an optimization for mask and shift +// Src0 = Input +// Src1 = Offset +// Src2 = Width +// +// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) +// +// Example Usage: +// (Offset, Width) +// +// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 +// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 +// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 +// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 +def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", + [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", + [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : BFEPattern ; + +def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", + [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + +defm : BFIPatterns ; + +def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], + VecALU +>; + +def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU +>; + +def : UMad24Pat; + +def 
BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; +def : ROTRPattern ; +def MULADD_eg : MULADD_Common<0x14>; +def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; +def FMA_eg : FMA_Common<0x7>; +def ASHR_eg : ASHR_Common<0x15>; +def LSHR_eg : LSHR_Common<0x16>; +def LSHL_eg : LSHL_Common<0x17>; +def CNDE_eg : CNDE_Common<0x19>; +def CNDGT_eg : CNDGT_Common<0x1A>; +def CNDGE_eg : CNDGE_Common<0x1B>; +def MUL_LIT_eg : MUL_LIT_Common<0x1F>; +def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; +def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU +>; +def DOT4_eg : DOT4_Common<0xBE>; +defm CUBE_eg : CUBE_Common<0xC0>; + +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + +def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; +def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; + +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; + +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; +} + +def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; + +def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { + let Pattern = []; + let Itinerary = AnyALU; +} + +def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + +def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { + let Pattern = []; +} + +def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; + +def GROUP_BARRIER : InstR600 < + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + R600ALU_Word0, + R600ALU_Word1_OP2 <0x54> { + + let dst = 0; + let dst_rel = 0; + let src0 = 0; + let src0_rel = 0; + let src0_neg = 0; + let src0_abs = 0; + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let write = 0; + let omod = 0; + let clamp = 0; + let last = 1; + let bank_swizzle = 0; + let pred_sel = 0; + let update_exec_mask = 0; + let update_pred = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; +} + +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// LDS Instructions +//===----------------------------------------------------------------------===// +class R600_LDS op, dag outs, dag ins, string asm, + list pattern = []> : + + InstR600 , + R600_ALU_LDS_Word0, + R600LDS_Word1 { + + bits<6> offset = 0; + let lds_op = op; + + let Word1{27} = offset{0}; + let Word1{12} = offset{1}; + let Word1{28} = offset{2}; + let Word1{31} = offset{3}; + let Word0{12} = offset{4}; + let Word0{25} = offset{5}; + + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; + let HasNativeOperands = 1; + let UseNamedOperandTable = 1; +} + +class R600_LDS_1A lds_op, string name, list pattern> : R600_LDS < + lds_op, + (outs R600_Reg32:$dst), + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last OQAP, $src0$src0_rel $pred_sel", + pattern + > { + + let src1 = 0; + let src1_rel = 0; + let src2 = 0; + let src2_rel = 0; + + let usesCustomInserter = 1; + let LDS_1A = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A1D lds_op, dag outs, string name, list pattern, + string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), 
+ " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", + pattern + > { + + field string BaseOp; + + let src2 = 0; + let src2_rel = 0; + let LDS_1A1D = 1; +} + +class R600_LDS_1A1D_NORET lds_op, string name, list pattern> : + R600_LDS_1A1D { + let BaseOp = name; +} + +class R600_LDS_1A1D_RET lds_op, string name, list pattern> : + R600_LDS_1A1D { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A2D lds_op, dag outs, string name, list pattern, + string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), + " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", + pattern> { + + field string BaseOp; + + let LDS_1A1D = 0; + let LDS_1A2D = 1; +} + +class R600_LDS_1A2D_NORET lds_op, string name, list pattern> : + R600_LDS_1A2D { + let BaseOp = name; +} + +class R600_LDS_1A2D_RET lds_op, string name, list pattern> : + R600_LDS_1A2D { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; +def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; +def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; +def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; +def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; +def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; +def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; +def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; +def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; +def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; +def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; +def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", + [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] +>; +def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", + [(truncstorei8_local i32:$src1, i32:$src0)] +>; +def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", + [(truncstorei16_local i32:$src1, i32:$src0)] +>; +def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", + [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] +>; +def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", + [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] +>; +def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", + [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] +>; +def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", + [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] +>; +def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", + [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", + [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", + [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", + [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", + [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] +>; +def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", + [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] +>; +def 
LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", + [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] +>; +def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", + [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] +>; +def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", + [(set i32:$dst, (sextloadi8_local i32:$src0))] +>; +def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", + [(set i32:$dst, (az_extloadi8_local i32:$src0))] +>; +def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", + [(set i32:$dst, (sextloadi16_local i32:$src0))] +>; +def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", + [(set i32:$dst, (az_extloadi16_local i32:$src0))] +>; + +// TRUNC is used for the FLT_TO_INT instructions to work around a +// perceived problem where the rounding modes are applied differently +// depending on the instruction and the slot they are in. +// See: +// https://bugs.freedesktop.org/show_bug.cgi?id=50232 +// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c +// +// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, +// which do not need to be truncated since the fp values are 0.0f or 1.0f. +// We should look into handling these cases separately. +def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; + +def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; + +// SHA-256 Patterns +def : SHA256MaPattern ; + +def EG_ExportSwz : ExportSwzInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : ExportPattern; + +def EG_ExportBuf : ExportBufInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : SteamOutputExportPattern; + +def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "PUSH @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; +} +def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; +} + +} // End Predicates = [isEGorCayman] diff --git 
a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 index 00000000000..e811d5cff22 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -0,0 +1,642 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + OS.flush(); + printInstruction(MI, OS); + + printAnnotation(OS, Annot); +} + +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + +void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " offen"; +} + +void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " idxen"; +} + +void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " addr64"; +} + +void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " gds"; +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " glc"; +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if 
(MI->getOperand(OpNo).getImm()) + O << " slc"; +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " tfe"; +} + +void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, + const MCRegisterInfo &MRI) { + switch (reg) { + case AMDGPU::VCC: + O << "vcc"; + return; + case AMDGPU::SCC: + O << "scc"; + return; + case AMDGPU::EXEC: + O << "exec"; + return; + case AMDGPU::M0: + O << "m0"; + return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; + return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; + default: + break; + } + + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } + + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
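+  // For instance, a VGPR pair from VReg_64 whose encoding value has low
+  // 8 bits equal to 6 prints as "v[6:7]", while an SGPR_32 register with
+  // index 3 prints as "s3" (the indices here are sample values, not fixed
+  // encodings).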
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; + return; + } + + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} + +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else + O << "_e32 "; + + printOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == FloatToBits(0.0f)) + O << "0.0"; + else if (Imm == FloatToBits(1.0f)) + O << "1.0"; + else if (Imm == FloatToBits(-1.0f)) + O << "-1.0"; + else if (Imm == FloatToBits(0.5f)) + O << "0.5"; + else if (Imm == FloatToBits(-0.5f)) + O << "-0.5"; + else if (Imm == FloatToBits(2.0f)) + O << "2.0"; + else if (Imm == FloatToBits(-2.0f)) + O << "-2.0"; + else if (Imm == FloatToBits(4.0f)) + O << "4.0"; + else if (Imm == FloatToBits(-4.0f)) + O << "-4.0"; + else + O << formatHex(static_cast(Imm)); +} + +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { + int64_t SImm = static_cast(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == DoubleToBits(0.0)) + O << "0.0"; + else if (Imm == DoubleToBits(1.0)) + O << "1.0"; + else if (Imm == DoubleToBits(-1.0)) + O << "-1.0"; + else if (Imm == DoubleToBits(0.5)) + O << "0.5"; + else if (Imm == DoubleToBits(-0.5)) + O << "-0.5"; + else if (Imm == DoubleToBits(2.0)) + O << "2.0"; + else if (Imm == DoubleToBits(-2.0)) + O << "-2.0"; + else if (Imm == DoubleToBits(4.0)) + O << "4.0"; + else if (Imm == DoubleToBits(-4.0)) + O << "-4.0"; + else + llvm_unreachable("64-bit literal constants not supported"); +} + +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. + case AMDGPU::PRED_SEL_OFF: + break; + + default: + printRegOperand(Op.getReg(), O, MRI); + break; + } + } else if (Op.isImm()) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int RCID = Desc.OpInfo[OpNo].RegClass; + if (RCID != -1) { + const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); + if (ImmRC.getSize() == 4) + printImmediate32(Op.getImm(), O); + else if (ImmRC.getSize() == 8) + printImmediate64(Op.getImm(), O); + else + llvm_unreachable("Invalid register class size"); + } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { + printImmediate32(Op.getImm(), O); + } else { + // We hit this for the immediate instruction bits that don't yet have a + // custom printer. + // TODO: Eventually this should be unnecessary. + O << formatDec(Op.getImm()); + } + } else if (Op.isFPImm()) { + // We special case 0.0 because otherwise it will be printed as an integer. 
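+    // FloatToBits(0.0) is 0, which printImmediate32 would print as the
+    // integer "0". Any other inline value, e.g. 1.0, still goes through
+    // printImmediate32/printImmediate64 and is printed as "1.0".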
+ if (Op.getFPImm() == 0.0) + O << "0.0"; + else { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); + + if (ImmRC.getSize() == 4) + printImmediate32(FloatToBits(Op.getFPImm()), O); + else if (ImmRC.getSize() == 8) + printImmediate64(DoubleToBits(Op.getFPImm()), O); + else + llvm_unreachable("Invalid register class size"); + } + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O, &MAI); + } else { + llvm_unreachable("unknown operand type in printOperand"); + } +} + +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & SISrcMods::NEG) + O << '-'; + if (InputModifiers & SISrcMods::ABS) + O << '|'; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & SISrcMods::ABS) + O << '|'; +} + +void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + + if (Imm == 2) { + O << "P0"; + } else if (Imm == 1) { + O << "P20"; + } else if (Imm == 0) { + O << "P10"; + } else { + llvm_unreachable("Invalid interpolation parameter slot"); + } +} + +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef Asm, + StringRef Default) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) { + O << Asm; + } else { + O << Default; + } +} + +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "|"); +} + +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "_SAT"); +} + +void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " clamp"; +} + +void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Imm = MI->getOperand(OpNo).getImm(); + if (Imm == SIOutMods::MUL2) + O << " mul:2"; + else if (Imm == SIOutMods::MUL4) + O << " mul:4"; + else if (Imm == SIOutMods::DIV2) + O << " div:2"; +} + +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; +} + +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "*", " "); +} + +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "-"); +} + +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + break; + case 3: + O << " / 2.0"; + break; + } +} + +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "+"); +} + +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "Pred,"); +} + +void AMDGPUInstPrinter::printWrite(const MCInst 
*MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} + +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << '[' << sel << ']'; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << '.' << chans[chan]; +} + +void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int BankSwizzle = MI->getOperand(OpNo).getImm(); + switch (BankSwizzle) { + case 1: + O << "BS:VEC_021/SCL_122"; + break; + case 2: + O << "BS:VEC_120/SCL_212"; + break; + case 3: + O << "BS:VEC_102/SCL_221"; + break; + case 4: + O << "BS:VEC_201"; + break; + case 5: + O << "BS:VEC_210"; + break; + default: + break; + } + return; +} + +void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Sel = MI->getOperand(OpNo).getImm(); + switch (Sel) { + case 0: + O << 'X'; + break; + case 1: + O << 'Y'; + break; + case 2: + O << 'Z'; + break; + case 3: + O << 'W'; + break; + case 4: + O << '0'; + break; + case 5: + O << '1'; + break; + case 7: + O << '_'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CT = MI->getOperand(OpNo).getImm(); + switch (CT) { + case 0: + O << 'U'; + break; + case 1: + O << 'N'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int KCacheMode = MI->getOperand(OpNo).getImm(); + if (KCacheMode > 0) { + int KCacheBank = MI->getOperand(OpNo - 2).getImm(); + O << "CB" << KCacheBank << ':'; + int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); + int LineSize = (KCacheMode == 1) ? 16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; + } +} + +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Msg = SImm16 & 0xF; + if (Msg == 2 || Msg == 3) { + unsigned Op = (SImm16 >> 4) & 0xF; + if (Msg == 3) + O << "Gs_done("; + else + O << "Gs("; + if (Op == 0) { + O << "nop"; + } else { + unsigned Stream = (SImm16 >> 8) & 0x3; + if (Op == 1) + O << "cut"; + else if (Op == 2) + O << "emit"; + else if (Op == 3) + O << "emit-cut"; + O << " stream " << Stream; + } + O << "), [m0] "; + } else if (Msg == 1) + O << "interrupt "; + else if (Msg == 15) + O << "system "; + else + O << "unknown(" << Msg << ") "; +} + +void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs + // SIInsertWaits.cpp bits usage does not match ISA docs description but it + // works so it might be a misprint in docs. 
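+  // Worked example: an operand of 0x770 decodes below to Vmcnt = 0,
+  // Expcnt = 7 and Lgkmcnt = 7, so only "vmcnt(0)" is printed (0x770 is a
+  // sample value, not a constant taken from the ISA).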
+ unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Vmcnt = SImm16 & 0xF; + unsigned Expcnt = (SImm16 >> 4) & 0xF; + unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; + + bool NeedSpace = false; + + if (Vmcnt != 0xF) { + O << "vmcnt(" << Vmcnt << ')'; + NeedSpace = true; + } + + if (Expcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "expcnt(" << Expcnt << ')'; + NeedSpace = true; + } + + if (Lgkmcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "lgkmcnt(" << Lgkmcnt << ')'; + } +} + +#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 index 00000000000..14fb511e923 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -0,0 +1,88 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class AMDGPUInstPrinter : public MCInstPrinter { +public: + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + //Autogenerated by tblgen + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + static void printRegOperand(unsigned RegNo, raw_ostream &O, + const MCRegisterInfo &MRI); + +private: + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printImmediate32(uint32_t I, raw_ostream &O); + void printImmediate64(uint64_t I, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void 
printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, + StringRef Asm, StringRef Default = ""); + static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O); + static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt new file mode 100644 index 00000000000..ce63bd553b9 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAMDGPUAsmPrinter + AMDGPUInstPrinter.cpp + ) diff --git a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt new file mode 100644 index 00000000000..fdb43844dc6 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUAsmPrinter +parent = AMDGPU +required_libraries = MC Support +add_to_library_groups = AMDGPU + diff --git a/lib/Target/AMDGPU/InstPrinter/Makefile b/lib/Target/AMDGPU/InstPrinter/Makefile new file mode 100644 index 00000000000..a794cc1124e --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/Makefile @@ -0,0 +1,15 @@ +#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600AsmPrinter + +# Hack: we need to include 'main' x86 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt new file mode 100644 index 00000000000..c6861df91ed --- /dev/null +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -0,0 +1,33 @@ +;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo + +[component_0] +type = TargetGroup +name = AMDGPU +parent = Target +has_asmparser = 1 +has_asmprinter = 1 + +[component_1] +type = Library +name = AMDGPUCodeGen +parent = AMDGPU +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmParser AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo Scalar SelectionDAG Support Target TransformUtils +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp new file mode 100644 index 00000000000..8bed2deef4c --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -0,0 +1,144 @@ +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +namespace { + +class AMDGPUMCObjectWriter : public MCObjectWriter { +public: + AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} + void executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { + //XXX: Implement if necessary. 
+ } + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, bool &IsPCRel, + uint64_t &FixedValue) override { + assert(!"Not implemented"); + } + + void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + +}; + +class AMDGPUAsmBackend : public MCAsmBackend { +public: + AMDGPUAsmBackend(const Target &T) + : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + assert(!"Not implemented"); + } + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; +}; + +} //End anonymous namespace + +void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { + Asm.writeSectionData(&*I, Layout); + } +} + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("Unknown fixup kind"); + case AMDGPU::fixup_si_sopp_br: { + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + *Dst = (Value - 4) / 4; + break; + } + + case AMDGPU::fixup_si_rodata: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + *Dst = Value; + break; + } + + case AMDGPU::fixup_si_end_of_text: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + // The value points to the last instruction in the text section, so we + // need to add 4 bytes to get to the start of the constants. 
+ *Dst = Value + 4; + break; + } + } +} + +const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( + MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_si_rodata", 0, 32, 0 }, + { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + return Infos[Kind - FirstTargetFixupKind]; +} + +bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + OW->WriteZeros(Count); + + return true; +} + +//===----------------------------------------------------------------------===// +// ELFAMDGPUAsmBackend class +//===----------------------------------------------------------------------===// + +namespace { + +class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { +public: + ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAMDGPUELFObjectWriter(OS); + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return new ELFAMDGPUAsmBackend(T); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp new file mode 100644 index 00000000000..59f45ff02d8 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -0,0 +1,39 @@ +//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" + +using namespace llvm; + +namespace { + +class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { +public: + AMDGPUELFObjectWriter(); +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override { + return Fixup.getKind(); + } + +}; + + +} // End anonymous namespace + +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() + : MCELFObjectTargetWriter(false, 0, 0, false) { } + +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { + MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); + return createELFObjectWriter(MOTW, OS, true); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h new file mode 100644 index 00000000000..01021d67ffd --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -0,0 +1,34 @@ +//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AMDGPU { +enum Fixups { + /// 16-bit PC relative fixup for SOPP branch instructions. 
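+  /// The backend resolves this fixup in AMDGPUAsmBackend::applyFixup by
+  /// storing (Value - 4) / 4 into the 16-bit field.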
+ fixup_si_sopp_br = FirstTargetFixupKind, + + /// fixup for global addresses with constant initializers + fixup_si_rodata, + + /// fixup for offset from instruction to end of text section + fixup_si_end_of_text, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp new file mode 100644 index 00000000000..028a86dfc7a --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -0,0 +1,43 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCAsmInfo.h" + +using namespace llvm; +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { + HasSingleParameterDotFile = false; + //===------------------------------------------------------------------===// + MaxInstLength = 16; + SeparatorString = "\n"; + CommentString = ";"; + PrivateLabelPrefix = ""; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + + //===--- Data Emission Directives -------------------------------------===// + ZeroDirective = ".zero"; + AsciiDirective = ".ascii\t"; + AscizDirective = ".asciz\t"; + Data8bitsDirective = ".byte\t"; + Data16bitsDirective = ".short\t"; + Data32bitsDirective = ".long\t"; + Data64bitsDirective = ".quad\t"; + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = true; + + //===--- Global Variable Emission Directives --------------------------===// + HasAggressiveSymbolFolding = true; + COMMDirectiveAlignmentIsInBytes = false; + HasDotTypeDotSizeDirective = false; + HasNoDeadStrip = true; + WeakRefDirective = ".weakref\t"; + //===--- Dwarf Emission Directives -----------------------------------===// + SupportsDebugInformation = true; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h new file mode 100644 index 00000000000..a5bac51e356 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -0,0 +1,32 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H + +#include "llvm/MC/MCAsmInfoELF.h" +namespace llvm { + +class Triple; + +// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, +// you will need to make sure your new class sets PrivateGlobalPrefix to +// a prefix that won't appeary in a fuction name. The default value +// for PrivateGlobalPrefix is 'L', so it will consider any function starting +// with 'L' as a local symbol. 
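+// For example, with the default prefix a function named "LinearFunc" (a
+// made-up name) would be treated as a local symbol.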
+class AMDGPUMCAsmInfo : public MCAsmInfoELF { +public: + explicit AMDGPUMCAsmInfo(const Triple &TT); +}; +} // namespace llvm +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp new file mode 100644 index 00000000000..521b3b39bba --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -0,0 +1,21 @@ +//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCCodeEmitter.h" + +using namespace llvm; + +// pin vtable to this file +void AMDGPUMCCodeEmitter::anchor() {} + diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 index 00000000000..c9574276223 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -0,0 +1,50 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H + +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCInst; +class MCOperand; +class MCSubtargetInfo; + +class AMDGPUMCCodeEmitter : public MCCodeEmitter { + virtual void anchor(); +public: + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp new file mode 100644 index 00000000000..a7d3dd1345f --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -0,0 +1,90 @@ +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides AMDGPU specific target descriptions. 
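
The out-of-line AMDGPUMCCodeEmitter::anchor() definition above exists only to give the class's vtable a single home translation unit. A generic, self-contained illustration of the idiom follows; the Emitter type and file names are invented for the sketch.

    // "Anchor" (key-function) idiom: one virtual member is declared in the
    // header but defined in exactly one .cpp file, so compilers can emit the
    // vtable and type info in that one translation unit instead of in every
    // file that includes the header.

    // Emitter.h
    struct Emitter {
      virtual ~Emitter() = default;
      virtual void anchor();                 // declared, never defined inline
      virtual unsigned encode() const { return 0; }
    };

    // Emitter.cpp
    void Emitter::anchor() {}                // vtable is pinned to this file

    int main() { Emitter E; return E.encode(); }
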
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "AMDGPUMCAsmInfo.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDGPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDGPUGenRegisterInfo.inc" + +static MCInstrInfo *createAMDGPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDGPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo * +createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo * X = new MCSubtargetInfo(); + InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + return new AMDGPUInstPrinter(MAI, MII, MRI); +} + +extern "C" void LLVMInitializeAMDGPUTargetMC() { + for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + RegisterMCAsmInfo X(*T); + + TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + } + + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + createR600MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h new file mode 100644 index 00000000000..92e29dc7037 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -0,0 +1,61 @@ +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Provides AMDGPU specific target descriptions. 
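
LLVMInitializeAMDGPUTargetMC() above registers one factory function per MC component, and the only per-target difference is the code emitter (R600 versus SI). The toy model below shows only that factory-registration shape; the std::map, the function names, and the "r600"/"amdgcn" keys are illustrative stand-ins, not the TargetRegistry API.

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    // Toy stand-ins for the MC objects built by the registered factories.
    struct MCCodeEmitter { virtual ~MCCodeEmitter() = default; };
    struct R600Emitter : MCCodeEmitter {};   // VLIW-style encoding
    struct SIEmitter   : MCCodeEmitter {};   // GCN encoding

    using EmitterFactory = std::function<std::unique_ptr<MCCodeEmitter>()>;

    // Generic code looks targets up by name and never links their libraries.
    std::map<std::string, EmitterFactory> &emitterRegistry() {
      static std::map<std::string, EmitterFactory> R;
      return R;
    }

    void initializeEmitters() {
      emitterRegistry()["r600"]   = [] { return std::make_unique<R600Emitter>(); };
      emitterRegistry()["amdgcn"] = [] { return std::make_unique<SIEmitter>(); };
    }

    int main() {
      initializeEmitters();
      auto E = emitterRegistry()["amdgcn"]();   // builds the SI emitter
      return E ? 0 : 1;
    }
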
+// +//===----------------------------------------------------------------------===// +// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class Triple; +class raw_pwrite_stream; +class raw_ostream; + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); +} // End llvm namespace + +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" + +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt new file mode 100644 index 00000000000..151d0d5f83d --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,10 @@ + +add_llvm_library(LLVMAMDGPUDesc + AMDGPUAsmBackend.cpp + AMDGPUELFObjectWriter.cpp + AMDGPUMCCodeEmitter.cpp + AMDGPUMCTargetDesc.cpp + AMDGPUMCAsmInfo.cpp + R600MCCodeEmitter.cpp + SIMCCodeEmitter.cpp + ) diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 00000000000..4217bb36297 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUDesc +parent = AMDGPU +required_libraries = MC AMDGPUAsmPrinter AMDGPUInfo Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/Makefile b/lib/Target/AMDGPU/MCTargetDesc/Makefile new file mode 100644 index 00000000000..8894a7607f4 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
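
AMDGPUMCTargetDesc.h above pulls in only the enum sections of the TableGen-generated files by defining GET_*_ENUM before each include, while AMDGPUMCTargetDesc.cpp selects the *_MC_DESC sections instead. A self-contained sketch of that guard pattern follows; the generated content is inlined and invented here so the snippet compiles on its own.

    // A consumer asks for one section of a generated file by defining its
    // guard macro before the include; unrequested sections expand to nothing.
    #define GET_REGINFO_ENUM

    // ---- roughly what a generated *.inc looks like (inlined for the sketch,
    // ---- with made-up enumerators) ----
    #ifdef GET_REGINFO_ENUM
    namespace AMDGPU {
    enum : unsigned { NoRegister = 0, ExampleReg0, ExampleReg1 };
    } // namespace AMDGPU
    #undef GET_REGINFO_ENUM
    #endif

    #ifdef GET_REGINFO_MC_DESC          // not defined above, so this section
    // ... register descriptor tables   // is skipped by the preprocessor
    #undef GET_REGINFO_MC_DESC
    #endif

    int main() { return AMDGPU::ExampleReg0 == 1 ? 0 : 1; }
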
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 index 00000000000..e683498d52a --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -0,0 +1,181 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// \brief The R600 code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "R600Defines.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + void operator=(const R600MCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + +public: + + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) + : MCII(mcii), MRI(mri) { } + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. 
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; +private: + + void EmitByte(unsigned int byte, raw_ostream &OS) const; + + void Emit(uint32_t value, raw_ostream &OS) const; + void Emit(uint64_t value, raw_ostream &OS) const; + + unsigned getHWRegChan(unsigned reg) const; + unsigned getHWReg(unsigned regNo) const; + +}; + +} // End anonymous namespace + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum FCInstr { + FC_IF_PREDICATE = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK_PREDICATE, + FC_CONTINUE +}; + +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new R600MCCodeEmitter(MCII, MRI); +} + +void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::FETCH_CLAUSE || + MI.getOpcode() == AMDGPU::ALU_CLAUSE || + MI.getOpcode() == AMDGPU::BUNDLE || + MI.getOpcode() == AMDGPU::KILL) { + return; + } else if (IS_VTX(Desc)) { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + InstWord2 |= 1 << 19; // Mega-Fetch bit + } + + Emit(InstWord01, OS); + Emit(InstWord2, OS); + Emit((uint32_t) 0, OS); + } else if (IS_TEX(Desc)) { + int64_t Sampler = MI.getOperand(14).getImm(); + + int64_t SrcSelect[4] = { + MI.getOperand(2).getImm(), + MI.getOperand(3).getImm(), + MI.getOperand(4).getImm(), + MI.getOperand(5).getImm() + }; + int64_t Offsets[3] = { + MI.getOperand(6).getImm() & 0x1F, + MI.getOperand(7).getImm() & 0x1F, + MI.getOperand(8).getImm() & 0x1F + }; + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | + Offsets[2] << 10; + + Emit(Word01, OS); + Emit(Word2, OS); + Emit((uint32_t) 0, OS); + } else { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); + if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + ((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } + Emit(Inst, OS); + } +} + +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { + OS.write((uint8_t) Byte & 0xff); +} + +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { + support::endian::Writer(OS).write(Value); +} + +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { + support::endian::Writer(OS).write(Value); +} + +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { + return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +} + +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl &Fixup, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) + return MRI.getEncodingValue(MO.getReg()); + return getHWReg(MO.getReg()); + } + + assert(MO.isImm()); 
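
getHWReg() and getHWRegChan() above split the TableGen register encoding into a 9-bit register index and a channel number; the constants they rely on, HW_REG_MASK = 0x1ff and HW_CHAN_SHIFT = 9, are defined in R600Defines.h later in this patch. A quick standalone check of that split, with a made-up encoding for illustration:

    #include <cassert>

    constexpr unsigned HW_REG_MASK   = 0x1ff;   // low 9 bits: register index
    constexpr unsigned HW_CHAN_SHIFT = 9;       // bits above: channel (x/y/z/w)

    constexpr unsigned hwReg(unsigned Enc)  { return Enc & HW_REG_MASK; }
    constexpr unsigned hwChan(unsigned Enc) { return Enc >> HW_CHAN_SHIFT; }

    int main() {
      // Hypothetical encoding for register index 5, channel 2 (the .z lane).
      unsigned Enc = (2u << HW_CHAN_SHIFT) | 5u;
      assert(hwReg(Enc) == 5 && hwChan(Enc) == 2);
      return 0;
    }
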
+ return MO.getImm(); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 index 00000000000..65a0eeba2b1 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -0,0 +1,289 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The SI code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + void operator=(const SIMCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + MCContext &Ctx; + + /// \brief Can this operand also contain immediate values? + bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; + + /// \brief Encode an fp or int literal + uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; + +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + MCContext &ctx) + : MCII(mcii), MRI(mri), Ctx(ctx) { } + + ~SIMCCodeEmitter() override {} + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// instructions. + unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; +}; + +} // End anonymous namespace + +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new SIMCCodeEmitter(MCII, MRI, Ctx); +} + +bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, + unsigned OpNo) const { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + + return OpType == AMDGPU::OPERAND_REG_IMM32 || + OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. 
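
The template that follows implements the integer half of SI's inline-constant scheme; the float cases and the 255 = "a 32-bit literal dword follows" fallback come right after it in getLit32Encoding() and getLit64Encoding(). For reference, a standalone spot-check of the integer mapping (same rule, rehearsed outside LLVM):

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    // Same rule as getIntInlineImmEncoding() below: 0..64 -> 128..192,
    // -16..-1 -> 193..208, anything else is not an inline integer constant.
    static uint32_t intInlineImm(int32_t Imm) {
      if (Imm >= 0 && Imm <= 64)
        return 128 + Imm;
      if (Imm >= -16 && Imm <= -1)
        return 192 + std::abs(Imm);
      return 0; // fall through to the float checks, and finally to 255
    }

    int main() {
      assert(intInlineImm(0)   == 128);
      assert(intInlineImm(64)  == 192);
      assert(intInlineImm(-1)  == 193);
      assert(intInlineImm(-16) == 208);
      assert(intInlineImm(100) == 0);  // -> encoding 255, literal dword emitted
      return 0;
    }
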
+template +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; + + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); + + return 0; +} + +static uint32_t getLit32Encoding(uint32_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == FloatToBits(0.5f)) + return 240; + + if (Val == FloatToBits(-0.5f)) + return 241; + + if (Val == FloatToBits(1.0f)) + return 242; + + if (Val == FloatToBits(-1.0f)) + return 243; + + if (Val == FloatToBits(2.0f)) + return 244; + + if (Val == FloatToBits(-2.0f)) + return 245; + + if (Val == FloatToBits(4.0f)) + return 246; + + if (Val == FloatToBits(-4.0f)) + return 247; + + return 255; +} + +static uint32_t getLit64Encoding(uint64_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == DoubleToBits(0.5)) + return 240; + + if (Val == DoubleToBits(-0.5)) + return 241; + + if (Val == DoubleToBits(1.0)) + return 242; + + if (Val == DoubleToBits(-1.0)) + return 243; + + if (Val == DoubleToBits(2.0)) + return 244; + + if (Val == DoubleToBits(-2.0)) + return 245; + + if (Val == DoubleToBits(4.0)) + return 246; + + if (Val == DoubleToBits(-4.0)) + return 247; + + return 255; +} + +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, + unsigned OpSize) const { + if (MO.isExpr()) + return 255; + + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + if (OpSize == 4) + return getLit32Encoding(static_cast(MO.getImm())); + + assert(OpSize == 8); + + return getLit64Encoding(static_cast(MO.getImm())); +} + +void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = Desc.getSize(); + + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } + + if (bytes > 4) + return; + + // Check for additional literals in SRC0/1/2 (Op 1/2/3) + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!isSrcOperand(Desc, i)) + continue; + + int RCID = Desc.OpInfo[i].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + // Is this operand a literal immediate? + const MCOperand &Op = MI.getOperand(i); + if (getLitEncoding(Op, RC.getSize()) != 255) + continue; + + // Yes! Encode it + int64_t Imm = 0; + + if (Op.isImm()) + Imm = Op.getImm(); + else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. 
+ llvm_unreachable("Must be immediate or expr"); + + for (unsigned j = 0; j < 4; j++) { + OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); + } + + // Only one literal value allowed + break; + } +} + +unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + return getMachineOpValue(MI, MO, Fixups, STI); +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + + if (MO.isExpr()) { + const MCSymbolRefExpr *Expr = cast(MO.getExpr()); + MCFixupKind Kind; + const MCSymbol *Sym = + Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + + if (&Expr->getSymbol() == Sym) { + // Add the offset to the beginning of the constant values. + Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; + } else { + // This is used for constant data stored in .rodata. + Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; + } + Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); + } + + // Figure out the operand number, needed for isSrcOperand check + unsigned OpNo = 0; + for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { + if (&MO == &MI.getOperand(OpNo)) + break; + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (isSrcOperand(Desc, OpNo)) { + int RCID = Desc.OpInfo[OpNo].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + uint32_t Enc = getLitEncoding(MO, RC.getSize()); + if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + return Enc; + + } else if (MO.isImm()) + return MO.getImm(); + + llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + diff --git a/lib/Target/AMDGPU/Makefile b/lib/Target/AMDGPU/Makefile new file mode 100644 index 00000000000..64a7c8c045c --- /dev/null +++ b/lib/Target/AMDGPU/Makefile @@ -0,0 +1,23 @@ +##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMR600CodeGen +TARGET = AMDGPU + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ + AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ + AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ + AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ + AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc + +DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td new file mode 100644 index 00000000000..c0ffede5199 --- /dev/null +++ b/lib/Target/AMDGPU/Processors.td @@ -0,0 +1,137 @@ +//===-- Processors.td - R600 Processor definitions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +class Proc Features> +: Processor; + +//===----------------------------------------------------------------------===// +// R600 +//===----------------------------------------------------------------------===// +def : Proc<"", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache]>; + +def : Proc<"r600", R600_VLIW5_Itin, + [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"r630", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rs880", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize16]>; + +def : Proc<"rv670", R600_VLIW5_Itin, + [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// R700 +//===----------------------------------------------------------------------===// + +def : Proc<"rv710", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv730", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv770", R600_VLIW5_Itin, + [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Evergreen +//===----------------------------------------------------------------------===// + +def : Proc<"cedar", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, + FeatureCFALUBug]>; + +def : Proc<"redwood", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, + FeatureCFALUBug]>; + +def : Proc<"sumo", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; + +def : Proc<"juniper", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"cypress", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureFP64, FeatureVertexCache, + FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Northern Islands +//===----------------------------------------------------------------------===// + +def : Proc<"barts", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"turks", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"caicos", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureCFALUBug]>; + +def : Proc<"cayman", R600_VLIW4_Itin, + [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; + +//===----------------------------------------------------------------------===// +// Southern Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +//===----------------------------------------------------------------------===// +// Sea Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"bonaire", 
SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16] +>; + +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16]>; + +//===----------------------------------------------------------------------===// +// Volcanic Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"tonga", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureSGPRInitBug] +>; + +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureSGPRInitBug] +>; + +def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp new file mode 100644 index 00000000000..3cb90218a7d --- /dev/null +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -0,0 +1,206 @@ +//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. +/// This pass is merging consecutive CFAlus where applicable. +/// It needs to be called after IfCvt for best results. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "r600mergeclause" + +namespace { + +static bool isCFAlu(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU: + case AMDGPU::CF_ALU_PUSH_BEFORE: + return true; + default: + return false; + } +} + +class R600ClauseMergePass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + unsigned getCFAluSize(const MachineInstr *MI) const; + bool isCFAluEnabled(const MachineInstr *MI) const; + + /// IfCvt pass can generate "disabled" ALU clause marker that need to be + /// removed and their content affected to the previous alu clause. + /// This function parse instructions after CFAlu until it find a disabled + /// CFAlu and merge the content, or an enabled CFAlu. + void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; + + /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if + /// it is the case. 
+ bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) + const; + +public: + R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override; +}; + +char R600ClauseMergePass::ID = 0; + +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); +} + +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); +} + +void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) + const { + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + I++; + do { + while (I!= E && !isCFAlu(I)) + I++; + if (I == E) + return; + MachineInstr *MI = I++; + if (isCFAluEnabled(MI)) + break; + CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI->eraseFromParent(); + } while (I != E); +} + +bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, + const MachineInstr *LatrCFAlu) const { + assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + unsigned RootInstCount = getCFAluSize(RootCFAlu), + LaterInstCount = getCFAluSize(LatrCFAlu); + unsigned CumuledInsts = RootInstCount + LaterInstCount; + if (CumuledInsts >= TII->getMaxAlusPerClause()) { + DEBUG(dbgs() << "Excess inst counts\n"); + return false; + } + if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return false; + // Is KCache Bank 0 compatible ? + int Mode0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + int KBank0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + int KBank0LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + if (LatrCFAlu->getOperand(Mode0Idx).getImm() && + RootCFAlu->getOperand(Mode0Idx).getImm() && + (LatrCFAlu->getOperand(KBank0Idx).getImm() != + RootCFAlu->getOperand(KBank0Idx).getImm() || + LatrCFAlu->getOperand(KBank0LineIdx).getImm() != + RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + // Is KCache Bank 1 compatible ? 
+ int Mode1Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); + int KBank1Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); + int KBank1LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); + if (LatrCFAlu->getOperand(Mode1Idx).getImm() && + RootCFAlu->getOperand(Mode1Idx).getImm() && + (LatrCFAlu->getOperand(KBank1Idx).getImm() != + RootCFAlu->getOperand(KBank1Idx).getImm() || + LatrCFAlu->getOperand(KBank1LineIdx).getImm() != + RootCFAlu->getOperand(KBank1LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { + RootCFAlu->getOperand(Mode0Idx).setImm( + LatrCFAlu->getOperand(Mode0Idx).getImm()); + RootCFAlu->getOperand(KBank0Idx).setImm( + LatrCFAlu->getOperand(KBank0Idx).getImm()); + RootCFAlu->getOperand(KBank0LineIdx).setImm( + LatrCFAlu->getOperand(KBank0LineIdx).getImm()); + } + if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { + RootCFAlu->getOperand(Mode1Idx).setImm( + LatrCFAlu->getOperand(Mode1Idx).getImm()); + RootCFAlu->getOperand(KBank1Idx).setImm( + LatrCFAlu->getOperand(KBank1Idx).getImm()); + RootCFAlu->getOperand(KBank1LineIdx).setImm( + LatrCFAlu->getOperand(KBank1LineIdx).getImm()); + } + RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); + RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); + return true; +} + +bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator LatestCFAlu = E; + while (I != E) { + MachineInstr *MI = I++; + if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || + TII->mustBeLastInClause(MI->getOpcode())) + LatestCFAlu = E; + if (!isCFAlu(MI)) + continue; + cleanPotentialDisabledCFAlu(MI); + + if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { + MI->eraseFromParent(); + } else { + assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); + LatestCFAlu = MI; + } + } + } + return false; +} + +const char *R600ClauseMergePass::getPassName() const { + return "R600 Merge Clause Markers Pass"; +} + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { + return new R600ClauseMergePass(TM); +} diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp new file mode 100644 index 00000000000..c8f37f61fc1 --- /dev/null +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -0,0 +1,679 @@ +//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass compute turns all control flow pseudo instructions into native one +/// computing their address on the fly ; it also sets STACK_SIZE info. 
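
The merge legality rules above boil down to three checks: the combined instruction count must stay below getMaxAlusPerClause(), the root clause must not be a CF_ALU_PUSH_BEFORE, and the KCache bank set-up of both clauses must be compatible. The toy model below shows only the counting part of that decision; the MachineInstr operands, the KCache checks, and the type names here are simplifications invented for the sketch.

    #include <cassert>
    #include <vector>

    struct ClauseMarker { unsigned Count; };   // stand-in for a CF_ALU marker

    // Collapse adjacent clause markers by summing their counts, as long as the
    // combined count stays under the per-clause limit.
    void mergeClauses(std::vector<ClauseMarker> &Clauses, unsigned MaxAlus) {
      std::vector<ClauseMarker> Out;
      for (const ClauseMarker &C : Clauses) {
        if (!Out.empty() && Out.back().Count + C.Count < MaxAlus)
          Out.back().Count += C.Count;         // merge into the previous clause
        else
          Out.push_back(C);                    // start a new clause
      }
      Clauses = Out;
    }

    int main() {
      std::vector<ClauseMarker> C = {{60}, {50}, {40}};
      mergeClauses(C, 120);                    // 60+50 merges; adding 40 would overflow
      assert(C.size() == 2 && C[0].Count == 110 && C[1].Count == 40);
      return 0;
    }
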
+//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "r600cf" + +namespace { + +struct CFStack { + + enum StackItem { + ENTRY = 0, + SUB_ENTRY = 1, + FIRST_NON_WQM_PUSH = 2, + FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 + }; + + const AMDGPUSubtarget *ST; + std::vector BranchStack; + std::vector LoopStack; + unsigned MaxStackSize; + unsigned CurrentEntries; + unsigned CurrentSubEntries; + + CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), + // We need to reserve a stack entry for CALL_FS in vertex shaders. + MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + CurrentEntries(0), CurrentSubEntries(0) { } + + unsigned getLoopDepth(); + bool branchStackContains(CFStack::StackItem); + bool requiresWorkAroundForInst(unsigned Opcode); + unsigned getSubEntrySize(CFStack::StackItem Item); + void updateMaxStackSize(); + void pushBranch(unsigned Opcode, bool isWQM = false); + void pushLoop(); + void popBranch(); + void popLoop(); +}; + +unsigned CFStack::getLoopDepth() { + return LoopStack.size(); +} + +bool CFStack::branchStackContains(CFStack::StackItem Item) { + for (std::vector::const_iterator I = BranchStack.begin(), + E = BranchStack.end(); I != E; ++I) { + if (*I == Item) + return true; + } + return false; +} + +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && + getLoopDepth() > 1) + return true; + + if (!ST->hasCFAluBug()) + return false; + + switch(Opcode) { + default: return false; + case AMDGPU::CF_ALU_PUSH_BEFORE: + case AMDGPU::CF_ALU_ELSE_AFTER: + case AMDGPU::CF_ALU_BREAK: + case AMDGPU::CF_ALU_CONTINUE: + if (CurrentSubEntries == 0) + return false; + if (ST->getWavefrontSize() == 64) { + // We are being conservative here. We only require this work-around if + // CurrentSubEntries > 3 && + // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) + // + // We have to be conservative, because we don't know for certain that + // our stack allocation algorithm for Evergreen/NI is correct. Applying this + // work-around when CurrentSubEntries > 3 allows us to over-allocate stack + // resources without any problems. + return CurrentSubEntries > 3; + } else { + assert(ST->getWavefrontSize() == 32); + // We are being conservative here. We only require the work-around if + // CurrentSubEntries > 7 && + // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) + // See the comment on the wavefront size == 64 case for why we are + // being conservative. + return CurrentSubEntries > 7; + } + } +} + +unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { + switch(Item) { + default: + return 0; + case CFStack::FIRST_NON_WQM_PUSH: + assert(!ST->hasCaymanISA()); + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { + // +1 For the push operation. + // +2 Extra space required. + return 3; + } else { + // Some documentation says that this is not necessary on Evergreen, + // but experimentation has show that we need to allocate 1 extra + // sub-entry for the first non-WQM push. + // +1 For the push operation. + // +1 Extra space required. 
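
CFStack tracks full entries and smaller sub-entries separately; updateMaxStackSize(), defined just below, combines them as CurrentEntries + RoundUpToAlignment(CurrentSubEntries, 4) / 4, so four sub-entries share one hardware stack entry. A quick worked check of that arithmetic, with arbitrary sample counts:

    #include <algorithm>
    #include <cassert>

    // Round up to the next multiple of 4, as RoundUpToAlignment(x, 4) does.
    static unsigned roundUpTo4(unsigned X) { return (X + 3) & ~3u; }

    int main() {
      unsigned CurrentEntries = 2, CurrentSubEntries = 5, MaxStackSize = 0;
      unsigned CurrentStackSize =
          CurrentEntries + roundUpTo4(CurrentSubEntries) / 4;
      MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
      assert(MaxStackSize == 4);   // 2 full entries + ceil(5 / 4) = 2 more
      return 0;
    }
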
+ return 2; + } + case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + // +1 For the push operation. + // +1 Extra space required. + return 2; + case CFStack::SUB_ENTRY: + return 1; + } +} + +void CFStack::updateMaxStackSize() { + unsigned CurrentStackSize = CurrentEntries + + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + MaxStackSize = std::max(CurrentStackSize, MaxStackSize); +} + +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (!isWQM) { + if (!ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries > 0 && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) + Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else + Item = CFStack::SUB_ENTRY; + } else + Item = CFStack::ENTRY; + break; + } + BranchStack.push_back(Item); + if (Item == CFStack::ENTRY) + CurrentEntries++; + else + CurrentSubEntries += getSubEntrySize(Item); + updateMaxStackSize(); +} + +void CFStack::pushLoop() { + LoopStack.push_back(CFStack::ENTRY); + CurrentEntries++; + updateMaxStackSize(); +} + +void CFStack::popBranch() { + CFStack::StackItem Top = BranchStack.back(); + if (Top == CFStack::ENTRY) + CurrentEntries--; + else + CurrentSubEntries-= getSubEntrySize(Top); + BranchStack.pop_back(); +} + +void CFStack::popLoop() { + CurrentEntries--; + LoopStack.pop_back(); +} + +class R600ControlFlowFinalizer : public MachineFunctionPass { + +private: + typedef std::pair > ClauseFile; + + enum ControlFlowInstruction { + CF_TC, + CF_VC, + CF_CALL_FS, + CF_WHILE_LOOP, + CF_END_LOOP, + CF_LOOP_BREAK, + CF_LOOP_CONTINUE, + CF_JUMP, + CF_ELSE, + CF_POP, + CF_END + }; + + static char ID; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + unsigned MaxFetchInst; + const AMDGPUSubtarget *ST; + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + return true; + default: + return false; + } + } + + const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { + unsigned Opcode = 0; + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + switch (CFI) { + case CF_TC: + Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + break; + case CF_VC: + Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + break; + case CF_CALL_FS: + Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + break; + case CF_WHILE_LOOP: + Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + break; + case CF_END_LOOP: + Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + break; + case CF_LOOP_BREAK: + Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + break; + case CF_LOOP_CONTINUE: + Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + break; + case CF_JUMP: + Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + break; + case CF_ELSE: + Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + break; + case CF_POP: + Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + break; + case CF_END: + if (ST->hasCaymanISA()) { + Opcode = AMDGPU::CF_END_CM; + break; + } + Opcode = isEg ? 
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + break; + } + assert (Opcode && "No opcode selected"); + return TII->get(Opcode); + } + + bool isCompatibleWithClause(const MachineInstr *MI, + std::set &DstRegs) const { + unsigned DstMI, SrcMI; + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg()) + continue; + if (MO.isDef()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + DstMI = Reg; + else + DstMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + if (MO.isUse()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + SrcMI = Reg; + else + SrcMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + } + if ((DstRegs.find(SrcMI) == DstRegs.end())) { + DstRegs.insert(DstMI); + return true; + } else + return false; + } + + ClauseFile + MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector ClauseContent; + unsigned AluInstCount = 0; + bool IsTex = TII->usesTextureCache(ClauseHead); + std::set DstRegs; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (AluInstCount >= MaxFetchInst) + break; + if ((IsTex && !TII->usesTextureCache(I)) || + (!IsTex && !TII->usesVertexCache(I))) + break; + if (!isCompatibleWithClause(I, DstRegs)) + break; + AluInstCount ++; + ClauseContent.push_back(I); + } + MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), + getHWInstrDesc(IsTex?CF_TC:CF_VC)) + .addImm(0) // ADDR + .addImm(AluInstCount - 1); // COUNT + return ClauseFile(MIb, std::move(ClauseContent)); + } + + void getLiteral(MachineInstr *MI, std::vector &Lits) const { + static const unsigned LiteralRegs[] = { + AMDGPU::ALU_LITERAL_X, + AMDGPU::ALU_LITERAL_Y, + AMDGPU::ALU_LITERAL_Z, + AMDGPU::ALU_LITERAL_W + }; + const SmallVector, 3 > Srcs = + TII->getSrcs(MI); + for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { + if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) + continue; + int64_t Imm = Srcs[i].second; + std::vector::iterator It = + std::find(Lits.begin(), Lits.end(), Imm); + if (It != Lits.end()) { + unsigned Index = It - Lits.begin(); + Srcs[i].first->setReg(LiteralRegs[Index]); + } else { + assert(Lits.size() < 4 && "Too many literals in Instruction Group"); + Srcs[i].first->setReg(LiteralRegs[Lits.size()]); + Lits.push_back(Imm); + } + } + } + + MachineBasicBlock::iterator insertLiterals( + MachineBasicBlock::iterator InsertPos, + const std::vector &Literals) const { + MachineBasicBlock *MBB = InsertPos->getParent(); + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned LiteralPair0 = Literals[i]; + unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; + InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(LiteralPair0) + .addImm(LiteralPair1); + } + return InsertPos; + } + + ClauseFile + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector ClauseContent; + I++; + for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { + if (IsTrivialInst(I)) { + ++I; + continue; + } + if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) + break; + std::vector Literals; + if (I->isBundle()) 
{ + MachineInstr *DeleteMI = I; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + while (++BI != E && BI->isBundledWithPred()) { + BI->unbundleFromPred(); + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BI->getOperand(i); + if (MO.isReg() && MO.isInternalRead()) + MO.setIsInternalRead(false); + } + getLiteral(BI, Literals); + ClauseContent.push_back(BI); + } + I = BI; + DeleteMI->eraseFromParent(); + } else { + getLiteral(I, Literals); + ClauseContent.push_back(I); + I++; + } + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned literal0 = Literals[i]; + unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; + MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(literal0) + .addImm(literal2); + ClauseContent.push_back(MILit); + } + } + assert(ClauseContent.size() < 128 && "ALU clause is too big"); + ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(ClauseHead, std::move(ClauseContent)); + } + + void + EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += 2 * Clause.second.size(); + } + + void + EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + Clause.first->getOperand(0).setImm(0); + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += Clause.second.size(); + } + + void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { + MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + } + void CounterPropagateAddr(const std::set &MIs, + unsigned Addr) const { + for (MachineInstr *MI : MIs) { + CounterPropagateAddr(MI, Addr); + } + } + +public: + R600ControlFlowFinalizer(TargetMachine &tm) + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + ST = &MF.getSubtarget(); + MaxFetchInst = ST->getTexVTXClauseSize(); + TII = static_cast(ST->getInstrInfo()); + TRI = static_cast(ST->getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo(); + + CFStack CFStack(ST, MFI->getShaderType()); + for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; + ++MB) { + MachineBasicBlock &MBB = *MB; + unsigned CfCount = 0; + std::vector > > LoopStack; + std::vector IfThenElseStack; + if (MFI->getShaderType() == ShaderType::VERTEX) { + BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), + getHWInstrDesc(CF_CALL_FS)); + CfCount++; + } + std::vector FetchClauses, AluClauses; + std::vector LastAlu(1); + std::vector ToPopAfter; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E;) { + if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + DEBUG(dbgs() << CfCount << ":"; I->dump();); + FetchClauses.push_back(MakeFetchClause(MBB, I)); + CfCount++; + LastAlu.back() = nullptr; + continue; + } + + MachineBasicBlock::iterator MI = I; + if (MI->getOpcode() != AMDGPU::ENDIF) + 
LastAlu.back() = nullptr; + if (MI->getOpcode() == AMDGPU::CF_ALU) + LastAlu.back() = MI; + I++; + bool RequiresWorkAround = + CFStack.requiresWorkAroundForInst(MI->getOpcode()); + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (RequiresWorkAround) { + DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) + .addImm(CfCount + 1) + .addImm(1); + MI->setDesc(TII->get(AMDGPU::CF_ALU)); + CfCount++; + CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else + CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + + case AMDGPU::CF_ALU: + I = MI; + AluClauses.push_back(MakeALUClause(MBB, I)); + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + break; + case AMDGPU::WHILELOOP: { + CFStack.pushLoop(); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_WHILE_LOOP)) + .addImm(1); + std::pair > Pair(CfCount, + std::set()); + Pair.second.insert(MIb); + LoopStack.push_back(std::move(Pair)); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDLOOP: { + CFStack.popLoop(); + std::pair > Pair = + std::move(LoopStack.back()); + LoopStack.pop_back(); + CounterPropagateAddr(Pair.second, CfCount); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) + .addImm(Pair.first + 1); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::IF_PREDICATE_SET: { + LastAlu.push_back(nullptr); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_JUMP)) + .addImm(0) + .addImm(0); + IfThenElseStack.push_back(MIb); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ELSE: { + MachineInstr * JumpInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(JumpInst, CfCount); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_ELSE)) + .addImm(0) + .addImm(0); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + IfThenElseStack.push_back(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDIF: { + CFStack.popBranch(); + if (LastAlu.back()) { + ToPopAfter.push_back(LastAlu.back()); + } else { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_POP)) + .addImm(CfCount + 1) + .addImm(1); + (void)MIb; + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + CfCount++; + } + + MachineInstr *IfOrElseInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(IfOrElseInst, CfCount); + IfOrElseInst->getOperand(1).setImm(1); + LastAlu.pop_back(); + MI->eraseFromParent(); + break; + } + case AMDGPU::BREAK: { + CfCount ++; + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_BREAK)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + break; + } + case AMDGPU::CONTINUE: { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_CONTINUE)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::RETURN: { + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + CfCount++; + MI->eraseFromParent(); + if (CfCount % 2) { + BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + CfCount++; + } + for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) + EmitFetchClause(I, FetchClauses[i], CfCount); + for (unsigned i = 0, e = AluClauses.size(); i < e; i++) + EmitALUClause(I, AluClauses[i], CfCount); 
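
MakeALUClause() and insertLiterals() above both flush the collected literal pool two values at a time, padding an odd count with a zero, since each LITERALS pseudo carries a pair of immediates. A small standalone sketch of that pairing; the function name and the sample literal values are invented for illustration.

    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Pack a literal pool into (first, second) pairs, zero-padding the tail,
    // mirroring the Literals loops in the pass above.
    static std::vector<std::pair<uint32_t, uint32_t>>
    packLiterals(const std::vector<uint32_t> &Lits) {
      std::vector<std::pair<uint32_t, uint32_t>> Out;
      for (unsigned i = 0, e = Lits.size(); i < e; i += 2)
        Out.push_back({Lits[i], (i + 1 < e) ? Lits[i + 1] : 0});
      return Out;
    }

    int main() {
      auto P = packLiterals({0x3f800000, 0x40000000, 0x7f});
      assert(P.size() == 2);
      assert(P[1].first == 0x7f && P[1].second == 0);  // odd literal padded with 0
      return 0;
    }
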
+ } + default: + if (TII->isExport(MI->getOpcode())) { + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + } + break; + } + } + for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { + MachineInstr *Alu = ToPopAfter[i]; + BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), + TII->get(AMDGPU::CF_ALU_POP_AFTER)) + .addImm(Alu->getOperand(0).getImm()) + .addImm(Alu->getOperand(1).getImm()) + .addImm(Alu->getOperand(2).getImm()) + .addImm(Alu->getOperand(3).getImm()) + .addImm(Alu->getOperand(4).getImm()) + .addImm(Alu->getOperand(5).getImm()) + .addImm(Alu->getOperand(6).getImm()) + .addImm(Alu->getOperand(7).getImm()) + .addImm(Alu->getOperand(8).getImm()); + Alu->eraseFromParent(); + } + MFI->StackSize = CFStack.MaxStackSize; + } + + return false; + } + + const char *getPassName() const override { + return "R600 Control Flow Finalizer Pass"; + } +}; + +char R600ControlFlowFinalizer::ID = 0; + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { + return new R600ControlFlowFinalizer(TM); +} diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h new file mode 100644 index 00000000000..51d87eda31d --- /dev/null +++ b/lib/Target/AMDGPU/R600Defines.h @@ -0,0 +1,171 @@ +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H +#define LLVM_LIB_TARGET_R600_R600DEFINES_H + +#include "llvm/MC/MCRegisterInfo.h" + +// Operand Flags +#define MO_FLAG_CLAMP (1 << 0) +#define MO_FLAG_NEG (1 << 1) +#define MO_FLAG_ABS (1 << 2) +#define MO_FLAG_MASK (1 << 3) +#define MO_FLAG_PUSH (1 << 4) +#define MO_FLAG_NOT_LAST (1 << 5) +#define MO_FLAG_LAST (1 << 6) +#define NUM_MO_FLAGS 7 + +/// \brief Helper for getting the operand index for the instruction flags +/// operand. 
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) + +namespace R600_InstFlag { + enum TIF { + TRANS_ONLY = (1 << 0), + TEX = (1 << 1), + REDUCTION = (1 << 2), + FC = (1 << 3), + TRIG = (1 << 4), + OP3 = (1 << 5), + VECTOR = (1 << 6), + //FlagOperand bits 7, 8 + NATIVE_OPERANDS = (1 << 9), + OP1 = (1 << 10), + OP2 = (1 << 11), + VTX_INST = (1 << 12), + TEX_INST = (1 << 13), + ALU_INST = (1 << 14), + LDS_1A = (1 << 15), + LDS_1A1D = (1 << 16), + IS_EXPORT = (1 << 17), + LDS_1A2D = (1 << 18) + }; +} + +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) + +/// \brief Defines for extracting register information from register encoding +#define HW_REG_MASK 0x1ff +#define HW_CHAN_SHIFT 9 + +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + +#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) +#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) + +namespace OpName { + + enum VecOps { + UPDATE_EXEC_MASK_X, + UPDATE_PREDICATE_X, + WRITE_X, + OMOD_X, + DST_REL_X, + CLAMP_X, + SRC0_X, + SRC0_NEG_X, + SRC0_REL_X, + SRC0_ABS_X, + SRC0_SEL_X, + SRC1_X, + SRC1_NEG_X, + SRC1_REL_X, + SRC1_ABS_X, + SRC1_SEL_X, + PRED_SEL_X, + UPDATE_EXEC_MASK_Y, + UPDATE_PREDICATE_Y, + WRITE_Y, + OMOD_Y, + DST_REL_Y, + CLAMP_Y, + SRC0_Y, + SRC0_NEG_Y, + SRC0_REL_Y, + SRC0_ABS_Y, + SRC0_SEL_Y, + SRC1_Y, + SRC1_NEG_Y, + SRC1_REL_Y, + SRC1_ABS_Y, + SRC1_SEL_Y, + PRED_SEL_Y, + UPDATE_EXEC_MASK_Z, + UPDATE_PREDICATE_Z, + WRITE_Z, + OMOD_Z, + DST_REL_Z, + CLAMP_Z, + SRC0_Z, + SRC0_NEG_Z, + SRC0_REL_Z, + SRC0_ABS_Z, + SRC0_SEL_Z, + SRC1_Z, + SRC1_NEG_Z, + SRC1_REL_Z, + SRC1_ABS_Z, + SRC1_SEL_Z, + PRED_SEL_Z, + UPDATE_EXEC_MASK_W, + UPDATE_PREDICATE_W, + WRITE_W, + OMOD_W, + DST_REL_W, + CLAMP_W, + SRC0_W, + SRC0_NEG_W, + SRC0_REL_W, + SRC0_ABS_W, + SRC0_SEL_W, + SRC1_W, + SRC1_NEG_W, + SRC1_REL_W, + SRC1_ABS_W, + SRC1_SEL_W, + PRED_SEL_W, + IMM_0, + IMM_1, + VEC_COUNT + }; + +} + +//===----------------------------------------------------------------------===// +// Config register definitions +//===----------------------------------------------------------------------===// + +#define R_02880C_DB_SHADER_CONTROL 0x02880C +#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) + +// These fields are the same for all shader types and families. 
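+// The value written to the SQ_PGM_RESOURCES registers below is typically
+// assembled by OR-ing these fields together, e.g.
+//   S_NUM_GPRS(NumGPRs) | S_STACK_SIZE(StackSize)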
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0) +#define S_STACK_SIZE(x) (((x) & 0xFF) << 8) +//===----------------------------------------------------------------------===// +// R600, R700 Registers +//===----------------------------------------------------------------------===// + +#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 +#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 + +//===----------------------------------------------------------------------===// +// Evergreen, Northern Islands Registers +//===----------------------------------------------------------------------===// + +#define R_028844_SQ_PGM_RESOURCES_PS 0x028844 +#define R_028860_SQ_PGM_RESOURCES_VS 0x028860 +#define R_028878_SQ_PGM_RESOURCES_GS 0x028878 +#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4 + +#define R_0288E8_SQ_LDS_ALLOC 0x0288E8 + +#endif diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp new file mode 100644 index 00000000000..fdc20302f4a --- /dev/null +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -0,0 +1,336 @@ +//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold +/// 128 Alu instructions ; these instructions can access up to 4 prefetched +/// 4 lines of 16 registers from constant buffers. Such ALU clauses are +/// initiated by CF_ALU instructions. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace llvm { + void initializeR600EmitClauseMarkersPass(PassRegistry&); +} + +namespace { + +class R600EmitClauseMarkers : public MachineFunctionPass { + +private: + const R600InstrInfo *TII; + int Address; + + unsigned OccupiedDwords(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return 4; + case AMDGPU::KILL: + return 0; + default: + break; + } + + // These will be expanded to two ALU instructions in the + // ExpandSpecialInstructions pass. 
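+    // (the LDS op itself plus the MOV that copies its result out of OQAP,
+    // hence the return value of 2 below).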
+ if (TII->isLDSRetInstr(MI->getOpcode())) + return 2; + + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return 4; + + unsigned NumLiteral = 0; + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++NumLiteral; + } + return 1 + NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { + if (TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } + } + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + case AMDGPU::IMPLICIT_DEF: + return true; + default: + return false; + } + } + + std::pair getAccessedBankLine(unsigned Sel) const { + // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 + // (See also R600ISelLowering.cpp) + // ConstIndex value is in [0, 4095]; + return std::pair( + ((Sel >> 2) - 512) >> 12, // KC_BANK + // Line Number of ConstIndex + // A line contains 16 constant registers however KCX bank can lock + // two line at the same time ; thus we want to get an even line number. + // Line number can be retrieved with (>>4), using (>>5) <<1 generates + // an even number. + ((((Sel >> 2) - 512) & 4095) >> 5) << 1); + } + + bool SubstituteKCacheBank(MachineInstr *MI, + std::vector > &CachedConsts, + bool UpdateInstr = true) const { + std::vector > UsedKCache; + + if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + return true; + + const SmallVectorImpl > &Consts = + TII->getSrcs(MI); + assert((TII->isALUInstr(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + unsigned Sel = Consts[i].second; + unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; + unsigned KCacheIndex = Index * 4 + Chan; + const std::pair &BankLine = getAccessedBankLine(Sel); + if (CachedConsts.empty()) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair(0, KCacheIndex)); + continue; + } + if (CachedConsts[0] == BankLine) { + UsedKCache.push_back(std::pair(0, KCacheIndex)); + continue; + } + if (CachedConsts.size() == 1) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair(1, KCacheIndex)); + continue; + } + if (CachedConsts[1] == BankLine) { + UsedKCache.push_back(std::pair(1, KCacheIndex)); + continue; + } + return false; + } + + if (!UpdateInstr) + return true; + + for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + switch(UsedKCache[j].first) { + case 0: + Consts[i].first->setReg( + AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + break; + case 1: + Consts[i].first->setReg( + AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + break; + default: + llvm_unreachable("Wrong Cache Line"); + } + j++; + } + return true; + } + + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + std::vector > KCacheBanks, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { + const R600RegisterInfo &TRI = 
TII->getRegisterInfo(); + for (MachineInstr::const_mop_iterator + MOI = Def->operands_begin(), + MOE = Def->operands_end(); MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) + continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { + AluInstCount += OccupiedDwords(UseI); + // Make sure we won't need to end the clause due to KCache limitations. + if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + return false; + + // We have reached the maximum instruction limit before finding the + // use that kills this register, so we cannot use this def in the + // current clause. + if (AluInstCount >= TII->getMaxAlusPerClause()) + return false; + + // Register kill flags have been cleared by the time we get to this + // pass, but it is safe to assume that all uses of this register + // occur in the same basic block as its definition, because + // it is illegal for the scheduler to schedule them in + // different blocks. + if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + LastUseCount = AluInstCount; + + if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + break; + } + if (LastUseCount) + return LastUseCount <= TII->getMaxAlusPerClause(); + llvm_unreachable("Clause local register live at end of clause."); + } + return true; + } + + MachineBasicBlock::iterator + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator ClauseHead = I; + std::vector > KCacheBanks; + bool PushBeforeModifier = false; + unsigned AluInstCount = 0; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (!isALU(I)) + break; + if (AluInstCount > TII->getMaxAlusPerClause()) + break; + if (I->getOpcode() == AMDGPU::PRED_X) { + // We put PRED_X in its own clause to ensure that ifcvt won't create + // clauses with more than 128 insts. + // IfCvt is indeed checking that "then" and "else" branches of an if + // statement have less than ~60 insts thus converted clauses can't be + // bigger than ~121 insts (predicate setter needs to be in the same + // clause as predicated alus). + if (AluInstCount > 0) + break; + if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + PushBeforeModifier = true; + AluInstCount ++; + continue; + } + // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: + // + // * KILL or INTERP instructions + // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits + // * Uses waterfalling (i.e. INDEX_MODE = AR.X) + // + // XXX: These checks have not been implemented yet. + if (TII->mustBeLastInClause(I->getOpcode())) { + I++; + break; + } + + // If this instruction defines a clause local register, make sure + // its use can fit in this clause. + if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) + break; + + if (!SubstituteKCacheBank(I, KCacheBanks)) + break; + AluInstCount += OccupiedDwords(I); + } + unsigned Opcode = PushBeforeModifier ? + AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) + // We don't use the ADDR field until R600ControlFlowFinalizer pass, where + // it is safe to assume it is 0. However if we always put 0 here, the ifcvt + // pass may assume that identical ALU clause starter at the beginning of a + // true and false branch can be factorized which is not the case. 
+ .addImm(Address++) // ADDR + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 + .addImm(KCacheBanks.empty()?0:2) // KM0 + .addImm((KCacheBanks.size() < 2)?0:2) // KM1 + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 + .addImm(AluInstCount) // COUNT + .addImm(1); // Enabled + return I; + } + +public: + static char ID; + R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { + + initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + if (I->getOpcode() == AMDGPU::CF_ALU) + continue; // BB was already parsed + for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { + if (isALU(I)) + I = MakeALUClause(MBB, I); + else + ++I; + } + } + return false; + } + + const char *getPassName() const override { + return "R600 Emit Clause Markers Pass"; + } +}; + +char R600EmitClauseMarkers::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", + "R600 Emit Clause Markters", false, false) +INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", + "R600 Emit Clause Markters", false, false) + +llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { + return new R600EmitClauseMarkers(); +} + diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp new file mode 100644 index 00000000000..211d392e8fc --- /dev/null +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -0,0 +1,349 @@ +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Vector, Reduction, and Cube instructions need to fill the entire instruction +/// group to work correctly. This pass expands these individual instructions +/// into several instructions that will completely fill the instruction group. 
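+/// For example a reduction op such as DP4, written on a single channel,
+/// is rewritten into four per-channel slots with the unused destinations
+/// write-masked; see the expansion sketches in runOnMachineFunction below.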
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class R600ExpandSpecialInstrsPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, + unsigned Op); + +public: + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "R600 Expand special instructions pass"; + } +}; + +} // End anonymous namespace + +char R600ExpandSpecialInstrsPass::ID = 0; + +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { + return new R600ExpandSpecialInstrsPass(TM); +} + +void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, + const MachineInstr *OldMI, unsigned Op) { + int OpIdx = TII->getOperandIdx(*OldMI, Op); + if (OpIdx > -1) { + uint64_t Val = OldMI->getOperand(OpIdx).getImm(); + TII->setImmOperand(NewMI, Op, Val); + } +} + +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + while (I != MBB.end()) { + MachineInstr &MI = *I; + I = std::next(I); + + // Expand LDS_*_RET instructions + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineOperand &DstOp = MI.getOperand(DstIdx); + MachineInstr *Mov = TII->buildMovInstr(&MBB, I, + DstOp.getReg(), AMDGPU::OQAP); + DstOp.setReg(AMDGPU::OQAP); + int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), + AMDGPU::OpName::pred_sel); + int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), + AMDGPU::OpName::pred_sel); + // Copy the pred_sel bit + Mov->getOperand(MovPredSelIdx).setReg( + MI.getOperand(LDSPredSelIdx).getReg()); + } + + switch (MI.getOpcode()) { + default: break; + // Expand PRED_X to one of the PRED_SET instructions. + case AMDGPU::PRED_X: { + uint64_t Flags = MI.getOperand(3).getImm(); + // The native opcode used by PRED_X is stored as an immediate in the + // third operand. + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + MI.getOperand(2).getImm(), // opcode + MI.getOperand(0).getReg(), // dst + MI.getOperand(1).getReg(), // src0 + AMDGPU::ZERO); // src1 + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + if (Flags & MO_FLAG_PUSH) { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); + } else { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); + } + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_XY: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = MI.getOperand(Chan).getReg(); + else + DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan >= 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_ZW: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; + else + DstReg = MI.getOperand(Chan-2).getReg(); + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan < 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_VEC_LOAD: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(1).getImm()); + unsigned DstReg = MI.getOperand(0).getReg(); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, + TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + case AMDGPU::DOT_4: { + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Mask) { + TII->addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + unsigned Opcode = BMI->getOpcode(); + // While not strictly necessary from hw point of view, we force + // all src operands of a dot4 inst to belong to the same slot. 
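+          // The assert below only applies when both source encodings are
+          // below 127, i.e. ordinary register operands; literals and other
+          // special operands are presumably exempt from the same-slot rule.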
+ unsigned Src0 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + .getReg(); + unsigned Src1 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + .getReg(); + (void) Src0; + (void) Src1; + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && + (TRI.getEncodingValue(Src1) & 0xff) < 127) + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); + } + MI.eraseFromParent(); + continue; + } + } + + bool IsReduction = TII->isReductionOp(MI.getOpcode()); + bool IsVector = TII->isVector(MI); + bool IsCube = TII->isCubeOp(MI.getOpcode()); + if (!IsReduction && !IsVector && !IsCube) { + continue; + } + + // Expand the instruction + // + // Reduction instructions: + // T0_X = DP4 T1_XYZW, T2_XYZW + // becomes: + // TO_X = DP4 T1_X, T2_X + // TO_Y (write masked) = DP4 T1_Y, T2_Y + // TO_Z (write masked) = DP4 T1_Z, T2_Z + // TO_W (write masked) = DP4 T1_W, T2_W + // + // Vector instructions: + // T0_X = MULLO_INT T1_X, T2_X + // becomes: + // T0_X = MULLO_INT T1_X, T2_X + // T0_Y (write masked) = MULLO_INT T1_X, T2_X + // T0_Z (write masked) = MULLO_INT T1_X, T2_X + // T0_W (write masked) = MULLO_INT T1_X, T2_X + // + // Cube instructions: + // T0_XYZW = CUBE T1_XYZW + // becomes: + // TO_X = CUBE T1_Z, T1_Y + // T0_Y = CUBE T1_Z, T1_X + // T0_Z = CUBE T1_X, T1_Z + // T0_W = CUBE T1_Y, T1_Z + for (unsigned Chan = 0; Chan < 4; Chan++) { + unsigned DstReg = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + unsigned Src0 = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + unsigned Src1 = 0; + + // Determine the correct source registers + if (!IsCube) { + int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + if (Src1Idx != -1) { + Src1 = MI.getOperand(Src1Idx).getReg(); + } + } + if (IsReduction) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + Src0 = TRI.getSubReg(Src0, SubRegIndex); + Src1 = TRI.getSubReg(Src1, SubRegIndex); + } else if (IsCube) { + static const int CubeSrcSwz[] = {2, 2, 0, 1}; + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + Src1 = TRI.getSubReg(Src0, SubRegIndex1); + Src0 = TRI.getSubReg(Src0, SubRegIndex0); + } + + // Determine the correct destination registers; + bool Mask = false; + bool NotLast = true; + if (IsCube) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + DstReg = TRI.getSubReg(DstReg, SubRegIndex); + } else { + // Mask the write if the original instruction does not write to + // the current Channel. 
+ Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + } + + // Set the IsLast bit + NotLast = (Chan != 3 ); + + // Add the new instruction + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::CUBE_r600_pseudo: + Opcode = AMDGPU::CUBE_r600_real; + break; + case AMDGPU::CUBE_eg_pseudo: + Opcode = AMDGPU::CUBE_eg_real; + break; + default: + break; + } + + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); + + if (Chan != 0) + NewMI->bundleWithPred(); + if (Mask) { + TII->addFlag(NewMI, 0, MO_FLAG_MASK); + } + if (NotLast) { + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + } + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + } + MI.eraseFromParent(); + } + } + return false; +} diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp new file mode 100644 index 00000000000..8357b6d9d0e --- /dev/null +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -0,0 +1,2286 @@ +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for R600 +// +//===----------------------------------------------------------------------===// + +#include "R600ISelLowering.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +R600TargetLowering::R600TargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + // Set condition code actions + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setCondCodeAction(ISD::SETLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + 
setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + + setCondCodeAction(ISD::SETLE, MVT::i32, Expand); + setCondCodeAction(ISD::SETLT, MVT::i32, Expand); + setCondCodeAction(ISD::SETULE, MVT::i32, Expand); + setCondCodeAction(ISD::SETULT, MVT::i32, Expand); + + setOperationAction(ISD::FCOS, MVT::f32, Custom); + setOperationAction(ISD::FSIN, MVT::f32, Custom); + + setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v2i32, Expand); + + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + setOperationAction(ISD::FSUB, MVT::f32, Expand); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + + // ADD, SUB overflow. + // TODO: turn these into Legal? + if (Subtarget->hasCARRY()) + setOperationAction(ISD::UADDO, MVT::i32, Custom); + + if (Subtarget->hasBORROW()) + setOperationAction(ISD::USUBO, MVT::i32, Custom); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + + + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. 
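+  // For every integer result type: extending loads from i1 are promoted,
+  // while i8 and i16 sources are custom lowered (see the comment above).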
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + + setSchedulingPreference(Sched::Source); +} + +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineFunction * MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock::iterator I = *MI; + const R600InstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + switch (MI->getOpcode()) { + default: + // Replace LDS_*_RET instruction that don't have any uses with the + // equivalent LDS_*_NORET instruction. + if (TII->isLDSRetInstr(MI->getOpcode())) { + int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineInstrBuilder NewMI; + // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add + // LDS_1A2D support and remove this special case. 
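+      // Keep the _RET form when its result is actually used (or for
+      // LDS_CMPST_RET, which has no no-return mapping yet); otherwise the
+      // instruction is rebuilt below with the equivalent NORET opcode.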
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || + MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + return BB; + + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI->getOperand(i)); + } + } else { + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + } + break; + case AMDGPU::CLAMP_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + break; + } + + case AMDGPU::FABS_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_ABS); + break; + } + + case AMDGPU::FNEG_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_NEG); + break; + } + + case AMDGPU::MASK_WRITE: { + unsigned maskedRegister = MI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); + TII->addFlag(defInstr, 0, MO_FLAG_MASK); + break; + } + + case AMDGPU::MOV_IMM_F32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getFPImm()->getValueAPF() + .bitcastToAPInt().getZExtValue()); + break; + case AMDGPU::MOV_IMM_I32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); + break; + case AMDGPU::CONST_COPY: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, + MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, + MI->getOperand(1).getImm()); + break; + } + + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_64_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(EOP); // Set End of program bit + break; + } + + case AMDGPU::TXD: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::TXD_SHADOW: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 
+ .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)); + break; + + case AMDGPU::BRANCH_COND_f32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::BRANCH_COND_i32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if its not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = std::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = std::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } + bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + if (!EOP && !isLastInstructionOfItsType) + return BB; + unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addImm(CfInst) + .addImm(EOP); + break; + } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. 
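+    // MFI->LiveOuts is filled in, for instance, by the AMDGPU_store_output
+    // intrinsic lowering in LowerOperation further down.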
+ R600MachineFunctionInfo *MFI = MF->getInfo(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } + } + + MI->eraseFromParent(); + return BB; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo(); + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); + case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); + case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); + case ISD::FCOS: + case ISD::FSIN: return LowerTrig(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::INTRINSIC_VOID: { + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = + cast(Op.getOperand(1))->getZExtValue(); + switch (IntrinsicID) { + case AMDGPUIntrinsic::AMDGPU_store_output: { + int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MFI->LiveOuts.push_back(Reg); + return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); + } + case AMDGPUIntrinsic::R600_store_swizzle: { + SDLoc DL(Op); + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, DL, MVT::i32), // SWZ_X + DAG.getConstant(1, DL, MVT::i32), // SWZ_Y + DAG.getConstant(2, DL, MVT::i32), // SWZ_Z + DAG.getConstant(3, DL, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); + } + + // default for switch(IntrinsicID) + default: break; + } + // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + switch(IntrinsicID) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case AMDGPUIntrinsic::R600_load_input: { + int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(Reg); + return DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), Reg, VT); + } + + case AMDGPUIntrinsic::R600_interp_input: { + int slot = cast(Op.getOperand(1))->getZExtValue(); + int ijb = cast(Op.getOperand(2))->getSExtValue(); + MachineSDNode *interp; + if (ijb < 0) { + const R600InstrInfo *TII = + 
static_cast(Subtarget->getInstrInfo()); + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); + unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); + MRI.addLiveIn(RegisterI); + MRI.addLiveIn(RegisterJ); + SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); + SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + return SDValue(interp, slot % 2); + } + case AMDGPUIntrinsic::R600_interp_xy: + case AMDGPUIntrinsic::R600_interp_zw: { + int slot = cast(Op.getOperand(1))->getZExtValue(); + MachineSDNode *interp; + SDValue RegisterINode = Op.getOperand(2); + SDValue RegisterJNode = Op.getOperand(3); + + if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, + SDValue(interp, 0), SDValue(interp, 1)); + } + case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::R600_ldptr: { + unsigned TextureOp; + switch (IntrinsicID) { + case AMDGPUIntrinsic::R600_tex: + TextureOp = 0; + break; + case AMDGPUIntrinsic::R600_texc: + TextureOp = 1; + break; + case AMDGPUIntrinsic::R600_txl: + TextureOp = 2; + break; + case AMDGPUIntrinsic::R600_txlc: + TextureOp = 3; + break; + case AMDGPUIntrinsic::R600_txb: + TextureOp = 4; + break; + case AMDGPUIntrinsic::R600_txbc: + TextureOp = 5; + break; + case AMDGPUIntrinsic::R600_txf: + TextureOp = 6; + break; + case AMDGPUIntrinsic::R600_txq: + TextureOp = 7; + break; + case AMDGPUIntrinsic::R600_ddx: + TextureOp = 8; + break; + case AMDGPUIntrinsic::R600_ddy: + TextureOp = 9; + break; + case AMDGPUIntrinsic::R600_ldptr: + TextureOp = 10; + break; + default: + llvm_unreachable("Unknow Texture Operation"); + } + + SDValue TexArgs[19] = { + DAG.getConstant(TextureOp, DL, MVT::i32), + Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + 
Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10) + }; + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); + } + case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(3, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(3, DL, MVT::i32)) + }; + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); + } + + case Intrinsic::r600_read_ngroups_x: + return LowerImplicitParameter(DAG, VT, DL, 0); + case Intrinsic::r600_read_ngroups_y: + return LowerImplicitParameter(DAG, VT, DL, 1); + case Intrinsic::r600_read_ngroups_z: + return LowerImplicitParameter(DAG, VT, DL, 2); + case Intrinsic::r600_read_global_size_x: + return LowerImplicitParameter(DAG, VT, DL, 3); + case Intrinsic::r600_read_global_size_y: + return LowerImplicitParameter(DAG, VT, DL, 4); + case Intrinsic::r600_read_global_size_z: + return LowerImplicitParameter(DAG, VT, DL, 5); + case Intrinsic::r600_read_local_size_x: + return LowerImplicitParameter(DAG, VT, DL, 6); + case Intrinsic::r600_read_local_size_y: + return LowerImplicitParameter(DAG, VT, DL, 7); + case Intrinsic::r600_read_local_size_z: + return LowerImplicitParameter(DAG, VT, DL, 8); + + case Intrinsic::AMDGPU_read_workdim: + return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) + break; + } + } // end switch(Op.getOpcode()) + return SDValue(); +} + +void R600TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); + return; + case ISD::FP_TO_UINT: + if (N->getValueType(0) == MVT::i1) { + Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + } + // Fall-through. Since we don't care about out of bounds values + // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint + // considers some extra cases which are not necessary here. + case ISD::FP_TO_SINT: { + SDValue Result; + if (expandFP_TO_SINT(N, Result, DAG)) + Results.push_back(Result); + return; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + LowerUDIVREM64(Op, DAG, Results); + break; + } + } +} + +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, + DAG.getConstant(i, DL, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); +} + +SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + // On hw >= R700, COS/SIN input must be between -1. and 1. 
+ // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDLoc DL(Op); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.15915494309, DL, MVT::f32)), + DAG.getConstantFP(0.5, DL, MVT::f32))); + unsigned TrigNode; + switch (Op.getOpcode()) { + case ISD::FCOS: + TrigNode = AMDGPUISD::COS_HW; + break; + case ISD::FSIN: + TrigNode = AMDGPUISD::SIN_HW; + break; + default: + llvm_unreachable("Wrong trig opcode"); + } + SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, FractPart, + DAG.getConstantFP(-0.5, DL, MVT::f32))); + if (Gen >= AMDGPUSubtarget::R700) + return TrigVal; + // On R600 hw, COS/SIN input must be between -Pi and Pi. + return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, + DAG.getConstantFP(3.14159265359, DL, MVT::f32)); +} + +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); + // Extend sign. + OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, + DAG.getValueType(MVT::i1)); + + SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); +} + +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(0.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETNE) + ); +} + +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, + unsigned DwordOffset) const { + unsigned ByteOffset = DwordOffset * 4; + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // We shouldn't be using an offset wider than 16-bits for implicit parameters. + assert(isInt<16>(ByteOffset)); + + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR + MachinePointerInfo(ConstantPointerNull::get(PtrType)), + false, false, false, 0); +} + +bool R600TargetLowering::isZero(SDValue Op) const { + if(ConstantSDNode *Cst = dyn_cast(Op)) { + return Cst->isNullValue(); + } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ + return CstFP->isZero(); + } else { + return false; + } +} + +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + SDValue Temp; + + if (VT == MVT::f32) { + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + if (MinMax) + return MinMax; + } + + // LHS and RHS are guaranteed to be the same value type + EVT CompareVT = LHS.getValueType(); + + // Check if we can lower this to a native operation. + + // Try to lower to a SET* instruction: + // + // SET* can match the following patterns: + // + // select_cc f32, f32, -1, 0, cc_supported + // select_cc f32, f32, 1.0f, 0.0f, cc_supported + // select_cc i32, i32, -1, 0, cc_supported + // + + // Move hardware True/False values to the correct operand. 
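+  // If True/False are swapped with respect to the condition, invert the
+  // condition code (or invert it and swap the operands) so the result can
+  // still be matched by a SET* instruction.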
+ ISD::CondCode CCOpcode = cast(CC)->get(); + ISD::CondCode InverseCC = + ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + if (isHWTrueValue(False) && isHWFalseValue(True)) { + if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { + std::swap(False, True); + CC = DAG.getCondCode(InverseCC); + } else { + ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); + if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { + std::swap(False, True); + std::swap(LHS, RHS); + CC = DAG.getCondCode(SwapInvCC); + } + } + } + + if (isHWTrueValue(True) && isHWFalseValue(False) && + (CompareVT == VT || VT == MVT::i32)) { + // This can be matched by a SET* instruction. + return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); + } + + // Try to lower to a CND* instruction: + // + // CND* can match the following patterns: + // + // select_cc f32, 0.0, f32, f32, cc_supported + // select_cc f32, 0.0, i32, i32, cc_supported + // select_cc i32, 0, f32, f32, cc_supported + // select_cc i32, 0, i32, i32, cc_supported + // + + // Try to move the zero value to the RHS + if (isZero(LHS)) { + ISD::CondCode CCOpcode = cast(CC)->get(); + // Try swapping the operands + ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); + if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { + std::swap(LHS, RHS); + CC = DAG.getCondCode(CCSwapped); + } else { + // Try inverting the conditon and then swapping the operands + ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); + CCSwapped = ISD::getSetCCSwappedOperands(CCInv); + if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { + std::swap(True, False); + std::swap(LHS, RHS); + CC = DAG.getCondCode(CCSwapped); + } + } + } + if (isZero(RHS)) { + SDValue Cond = LHS; + SDValue Zero = RHS; + ISD::CondCode CCOpcode = cast(CC)->get(); + if (CompareVT != VT) { + // Bitcast True / False to the correct types. This will end up being + // a nop, but it allows us to define only a single pattern in the + // .TD files for each CND* instruction rather than having to have + // one pattern for integer True/False and one for fp True/False + True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); + False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); + } + + switch (CCOpcode) { + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + Temp = True; + True = False; + False = Temp; + break; + default: + break; + } + SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, + Cond, Zero, + True, False, + DAG.getCondCode(CCOpcode)); + return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); + } + + // If we make it this for it means we have no native instructions to handle + // this SELECT_CC, so we must lower it. + SDValue HWTrue, HWFalse; + + if (CompareVT == MVT::f32) { + HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); + HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); + } else if (CompareVT == MVT::i32) { + HWTrue = DAG.getConstant(-1, DL, CompareVT); + HWFalse = DAG.getConstant(0, DL, CompareVT); + } + else { + llvm_unreachable("Unhandled value type in LowerSELECT_CC"); + } + + // Lower this unsupported SELECT_CC into a combination of two supported + // SELECT_CC operations. + SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); + + return DAG.getNode(ISD::SELECT_CC, DL, VT, + Cond, HWFalse, + True, False, + DAG.getCondCode(ISD::SETNE)); +} + +/// LLVM generates byte-addressed pointers. 
For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// for indirect addressing. +SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + SDLoc DL(Ptr); + return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, DL, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *StoreNode = cast(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Ptr = Op.getOperand(2); + + SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Result.getNode()) { + return Result; + } + + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + if (StoreNode->isTruncatingStore()) { + EVT VT = Value.getValueType(); + assert(VT.bitsLE(MVT::i32)); + EVT MemVT = StoreNode->getMemoryVT(); + SDValue MaskConstant; + if (MemVT == MVT::i8) { + MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); + } else { + assert(MemVT == MVT::i16); + MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); + } + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(0x00000003, DL, VT)); + SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, DL, VT)); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 + // vector instead. + SDValue Src[4] = { + ShiftedValue, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + Mask + }; + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); + SDValue Args[3] = { Chain, Input, DWordAddr }; + return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, + Op->getVTList(), Args, MemVT, + StoreNode->getMemOperand()); + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && + Value.getValueType().bitsGE(MVT::i32)) { + // Convert pointer from byte address to dword address. 
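+      // Each dword is 4 bytes, so this is simply a right shift by 2.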
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, DL, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + llvm_unreachable("Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; + } + } + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) { + return Ret; + } + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SmallVector Stores(NumElemVT); + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, DL, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + } + + return Chain; +} + +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: + return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + SDLoc DL(Op); + LoadSDNode *LoadNode = cast(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); + if (Ret.getNode()) { + SDValue Ops[2] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); + } + + // Lower 
loads constant address space global variable loads + if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa(GetUnderlyingObject( + LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { + + SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, + getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(2, DL, MVT::i32)); + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), + LoadNode->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + } + + if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { + SDValue MergedValues[2] = { + ScalarizeVectorLoad(Op, DAG), + Chain + }; + return DAG.getMergeValues(MergedValues, DL); + } + + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + if (ConstantBlock > -1 && + ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || + (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { + SDValue Result; + if (isa(LoadNode->getMemOperand()->getValue()) || + isa(LoadNode->getMemOperand()->getValue()) || + isa(Ptr)) { + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + EVT NewVT = MVT::v4i32; + unsigned NumElements = 4; + if (VT.isVector()) { + NewVT = VT; + NumElements = VT.getVectorNumElements(); + } + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, + makeArrayRef(Slots, NumElements)); + } else { + // non-constant ptr can't be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(4, DL, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) + ); + } + + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i32)); + } + + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, DL); + } + + // For most operations returning SDValue() will result in the node being + // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we + // need to manually expand loads that may be legal in some address spaces and + // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for + // compute shaders, since the data is sign extended when it is uploaded to the + // buffer. However SEXT loads from other address spaces are not supported, so + // we need to expand them here. 
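+  // The expansion below replaces the sign-extending load with a plain
+  // extending load followed by an explicit SIGN_EXTEND_INREG of the result.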
+ if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { + EVT MemVT = LoadNode->getMemoryVT(); + assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); + SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, + LoadNode->getPointerInfo(), MemVT, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + LoadNode->getAlignment()); + SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, + DAG.getValueType(MemVT)); + + SDValue MergedValues[2] = { Res, Chain }; + return DAG.getMergeValues(MergedValues, DL); + } + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2] = { + LoweredLoad, + Chain + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); +} + +/// XXX Only kernel functions are supported, so we can assume for now that +/// every function is a kernel function, but in the future we should use +/// separate calling conventions for kernel and non-kernel functions. +SDValue R600TargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo(); + + SmallVector LocalIns; + + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); + + AnalyzeFormalArguments(CCInfo, LocalIns); + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + CCValAssign &VA = ArgLocs[i]; + const ISD::InputArg &In = Ins[i]; + EVT VT = In.VT; + EVT MemVT = VA.getLocVT(); + if (!VT.isVector() && MemVT.isVector()) { + // Get load source type if scalarized. 
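+      // A vector argument that has been scalarized is loaded one element at a
+      // time, so use the element type as the memory type.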
+ MemVT = MemVT.getVectorElementType(); + } + + if (MFI->getShaderType() != ShaderType::COMPUTE) { + unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); + SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); + InVals.push_back(Register); + continue; + } + + PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // i64 isn't a legal type, so the register type used ends up as i32, which + // isn't expected here. It attempts to create this sextload, but it ends up + // being invalid. Somehow this seems to work with i64 arguments, but breaks + // for <1 x i64>. + + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + ISD::LoadExtType Ext = ISD::NON_EXTLOAD; + if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. + + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } + + // Compute the offset from the value. + // XXX - I think PartOffset should give you this, but it seems to give the + // size of the register which isn't useful. + + unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); + unsigned PartOffset = VA.getLocMemOffset(); + unsigned Offset = 36 + VA.getLocMemOffset(); + + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); + SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, DL, MVT::i32), + DAG.getUNDEF(MVT::i32), + PtrInfo, + MemVT, false, true, true, 4); + + // 4 is the preferred alignment for the CONSTANT memory space. + InVals.push_back(Arg); + MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); + } + return Chain; +} + +EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) + return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +static SDValue CompactSwizzlableVector( + SelectionDAG &DAG, SDValue VectorEntry, + DenseMap &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + // We mask write here to teach later passes that the ith element of this + // vector is undef. Thus we can use it to reduce 128 bits reg usage, + // break false dependencies and additionnaly make assembly easier to read. 
+ RemapSwizzle[i] = 7; // SEL_MASK_WRITE + if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { + if (C->isZero()) { + RemapSwizzle[i] = 4; // SEL_0 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } else if (C->isExactlyValue(1.0)) { + RemapSwizzle[i] = 5; // SEL_1 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } + } + + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + continue; + for (unsigned j = 0; j < i; j++) { + if (NewBldVec[i] == NewBldVec[j]) { + NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); + RemapSwizzle[i] = j; + break; + } + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + +static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, + DenseMap &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + bool isUnmovable[4] = { false, false, false, false }; + for (unsigned i = 0; i < 4; i++) { + RemapSwizzle[i] = i; + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (i == Idx) + isUnmovable[Idx] = true; + } + } + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (isUnmovable[Idx]) + continue; + // Swap i and Idx + std::swap(NewBldVec[Idx], NewBldVec[i]); + std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); + break; + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + + +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, + SDValue Swz[4], SelectionDAG &DAG, + SDLoc DL) const { + assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); + // Old -> New swizzle values + DenseMap SwizzleRemap; + + BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + SwizzleRemap.clear(); + BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + return BuildVector; +} + + +//===----------------------------------------------------------------------===// +// Custom DAG Optimizations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) + case ISD::FP_ROUND: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { + return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), + Arg.getOperand(0)); + } + break; + } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. 
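+  // The DX10 SET* variants produce -1/0 integer results, which is exactly the
+  // select_cc form created below.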
+ case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + SDLoc dl(N); + return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, dl, MVT::i32), // True + DAG.getConstant(0, dl, MVT::i32), // False + SelectCC.getOperand(4)); // CC + + break; + } + + // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx + // => build_vector elt0, ... , NewEltIdx, ... , eltN + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = N->getOperand(0); + SDValue InVal = N->getOperand(1); + SDValue EltNo = N->getOperand(2); + SDLoc dl(N); + + // If the inserted element is an UNDEF, just use the input vector. + if (InVal.getOpcode() == ISD::UNDEF) + return InVec; + + EVT VT = InVec.getValueType(); + + // If we can't generate a legal BUILD_VECTOR, exit + if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) + return SDValue(); + + // Check that we know which element is being inserted + if (!isa(EltNo)) + return SDValue(); + unsigned Elt = cast(EltNo)->getZExtValue(); + + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially + // be converted to a BUILD_VECTOR). Fill in the Ops vector with the + // vector elements. + SmallVector Ops; + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + Ops.append(InVec.getNode()->op_begin(), + InVec.getNode()->op_end()); + } else if (InVec.getOpcode() == ISD::UNDEF) { + unsigned NElts = VT.getVectorNumElements(); + Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); + } else { + return SDValue(); + } + + // Insert the element + if (Elt < Ops.size()) { + // All the operands of BUILD_VECTOR must have the same type; + // we enforce that here. + EVT OpVT = Ops[0].getValueType(); + if (InVal.getValueType() != OpVT) + InVal = OpVT.bitsGT(InVal.getValueType()) ? 
+ DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : + DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); + Ops[Elt] = InVal; + } + + // Return the new vector + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + if (Arg.getOpcode() == ISD::BITCAST && + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), + Arg->getOperand(0).getOperand(Element)); + } + } + } + + case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + // + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> + // selectcc x, y, a, b, cc + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + ISD::CondCode NCC = cast(N->getOperand(4))->get(); + + if (LHS.getOperand(2).getNode() != True.getNode() || + LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode()) { + return SDValue(); + } + + switch (NCC) { + default: return SDValue(); + case ISD::SETNE: return LHS; + case ISD::SETEQ: { + ISD::CondCode LHSCC = cast(LHS.getOperand(4))->get(); + LHSCC = ISD::getSetCCInverse(LHSCC, + LHS.getOperand(0).getValueType().isInteger()); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) + return DAG.getSelectCC(SDLoc(N), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + LHSCC); + break; + } + } + return SDValue(); + } + + case AMDGPUISD::EXPORT: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[8] = { + N->getOperand(0), // Chain + SDValue(), + N->getOperand(2), // ArrayBase + N->getOperand(3), // Type + N->getOperand(4), // SWZ_X + N->getOperand(5), // SWZ_Y + N->getOperand(6), // SWZ_Z + N->getOperand(7) // SWZ_W + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); + } + case AMDGPUISD::TEXTURE_FETCH: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[19] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + N->getOperand(3), + N->getOperand(4), + N->getOperand(5), + N->getOperand(6), + N->getOperand(7), + N->getOperand(8), + N->getOperand(9), + N->getOperand(10), + N->getOperand(11), + N->getOperand(12), + N->getOperand(13), + N->getOperand(14), + N->getOperand(15), + N->getOperand(16), + N->getOperand(17), + N->getOperand(18), + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); + } + } + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); +} + +static bool +FoldOperand(SDNode *ParentNode, unsigned 
SrcIdx, SDValue &Src, SDValue &Neg, + SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { + const R600InstrInfo *TII = + static_cast(DAG.getSubtarget().getInstrInfo()); + if (!Src.isMachineOpcode()) + return false; + switch (Src.getMachineOpcode()) { + case AMDGPU::FNEG_R600: + if (!Neg.getNode()) + return false; + Src = Src.getOperand(0); + Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::FABS_R600: + if (!Abs.getNode()) + return false; + Src = Src.getOperand(0); + Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::CONST_COPY: { + unsigned Opcode = ParentNode->getMachineOpcode(); + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + + if (!Sel.getNode()) + return false; + + SDValue CstOffset = Src.getOperand(0); + if (ParentNode->getValueType(0).isVector()) + return false; + + // Gather constants values + int SrcIndices[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + std::vector Consts; + for (int OtherSrcIdx : SrcIndices) { + int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); + if (OtherSrcIdx < 0 || OtherSelIdx < 0) + continue; + if (HasDst) { + OtherSrcIdx--; + OtherSelIdx--; + } + if (RegisterSDNode *Reg = + dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst + = cast(ParentNode->getOperand(OtherSelIdx)); + Consts.push_back(Cst->getZExtValue()); + } + } + } + + ConstantSDNode *Cst = cast(CstOffset); + Consts.push_back(Cst->getZExtValue()); + if (!TII->fitsConstReadLimitations(Consts)) { + return false; + } + + Sel = CstOffset; + Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + return true; + } + case AMDGPU::MOV_IMM_I32: + case AMDGPU::MOV_IMM_F32: { + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + uint64_t ImmValue = 0; + + + if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); + float FloatValue = FPC->getValueAPF().convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); + } + } else { + ConstantSDNode *C = dyn_cast(Src.getOperand(0)); + uint64_t Value = C->getZExtValue(); + if (Value == 0) { + ImmReg = AMDGPU::ZERO; + } else if (Value == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = Value; + } + } + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. 
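+    // Only fold this immediate if the instruction's literal operand is still
+    // zero, i.e. no other immediate has been folded into it yet.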
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (!Imm.getNode()) + return false; + ConstantSDNode *C = dyn_cast(Imm); + assert(C); + if (C->getZExtValue()) + return false; + Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); + } + Src = DAG.getRegister(ImmReg, MVT::i32); + return true; + } + default: + return false; + } +} + + +/// \brief Fold the instructions after selecting them +SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = + static_cast(DAG.getSubtarget().getInstrInfo()); + if (!Node->isMachineOpcode()) + return Node; + unsigned Opcode = Node->getMachineOpcode(); + SDValue FakeOp; + + std::vector Ops(Node->op_begin(), Node->op_end()); + + if (Opcode == AMDGPU::DOT_4) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + }; + for (unsigned i = 0; i < 8; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue &Abs = Ops[AbsIdx[i] - 1]; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + if (HasDst) + SelIdx--; + SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::REG_SEQUENCE) { + for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { + SDValue &Src = Ops[i]; + if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::CLAMP_R600) { + SDValue Src = Node->getOperand(0); + if (!Src.isMachineOpcode() || + !TII->hasInstrModifiers(Src.getMachineOpcode())) + return Node; + int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), + AMDGPU::OpName::clamp); + if (ClampIdx < 0) + return Node; + SDLoc DL(Node); + std::vector Ops(Src->op_begin(), Src->op_end()); + Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); + return DAG.getMachineNode(Src.getMachineOpcode(), DL, + Node->getVTList(), Ops); + } else { + if (!TII->hasInstrModifiers(Opcode)) + return Node; + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + -1 + }; + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue FakeAbs; + SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + if (HasDst) { + SelIdx--; + ImmIdx--; + } + SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; + SDValue &Imm = Ops[ImmIdx]; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } + + return Node; +} diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h new file mode 100644 index 00000000000..c06d3c4fd30 --- /dev/null +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -0,0 +1,80 @@ +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H + +#include "AMDGPUISelLowering.h" + +namespace llvm { + +class R600InstrInfo; + +class R600TargetLowering : public AMDGPUTargetLowering { +public: + R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + EVT getSetCCResultType(LLVMContext &, EVT VT) const override; +private: + unsigned Gen; + /// Each OpenCL kernel has nine implicit parameters that are stored in the + /// first nine dwords of a Vertex Buffer. These implicit parameters are + /// lowered to load instructions which retrieve the values from the Vertex + /// Buffer. + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, unsigned DwordOffset) const; + + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, + MachineRegisterInfo & MRI, unsigned dword_offset) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, + SDLoc DL) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; + bool isZero(SDValue Op) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; +}; + +} // End namespace llvm; + +#endif diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td new file mode 100644 index 00000000000..0ffd485476e --- /dev/null +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -0,0 +1,495 @@ +//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 Instruction format definitions. 
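+// These classes model the 32-bit instruction words (ALU, LDS, vertex fetch,
+// texture fetch and control flow) from which R600 family encodings are built.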
+// +//===----------------------------------------------------------------------===// + +class InstR600 pattern, + InstrItinClass itin> + : AMDGPUInst { + + field bits<64> Inst; + bit Trig = 0; + bit Op3 = 0; + bit isVector = 0; + bits<2> FlagOperandIdx = 0; + bit Op1 = 0; + bit Op2 = 0; + bit LDS_1A = 0; + bit LDS_1A1D = 0; + bit HasNativeOperands = 0; + bit VTXInst = 0; + bit TEXInst = 0; + bit ALUInst = 0; + bit IsExport = 0; + bit LDS_1A2D = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = itin; + + // No AsmMatcher support. + let isCodeGenOnly = 1; + + let TSFlags{4} = Trig; + let TSFlags{5} = Op3; + + // Vector instructions are instructions that must fill all slots in an + // instruction group + let TSFlags{6} = isVector; + let TSFlags{8-7} = FlagOperandIdx; + let TSFlags{9} = HasNativeOperands; + let TSFlags{10} = Op1; + let TSFlags{11} = Op2; + let TSFlags{12} = VTXInst; + let TSFlags{13} = TEXInst; + let TSFlags{14} = ALUInst; + let TSFlags{15} = LDS_1A; + let TSFlags{16} = LDS_1A1D; + let TSFlags{17} = IsExport; + let TSFlags{18} = LDS_1A2D; +} + +//===----------------------------------------------------------------------===// +// ALU instructions +//===----------------------------------------------------------------------===// + +class R600_ALU_LDS_Word0 { + field bits<32> Word0; + + bits<11> src0; + bits<1> src0_rel; + bits<11> src1; + bits<1> src1_rel; + bits<3> index_mode = 0; + bits<2> pred_sel; + bits<1> last; + + bits<9> src0_sel = src0{8-0}; + bits<2> src0_chan = src0{10-9}; + bits<9> src1_sel = src1{8-0}; + bits<2> src1_chan = src1{10-9}; + + let Word0{8-0} = src0_sel; + let Word0{9} = src0_rel; + let Word0{11-10} = src0_chan; + let Word0{21-13} = src1_sel; + let Word0{22} = src1_rel; + let Word0{24-23} = src1_chan; + let Word0{28-26} = index_mode; + let Word0{30-29} = pred_sel; + let Word0{31} = last; +} + +class R600ALU_Word0 : R600_ALU_LDS_Word0 { + + bits<1> src0_neg; + bits<1> src1_neg; + + let Word0{12} = src0_neg; + let Word0{25} = src1_neg; +} + +class R600ALU_Word1 { + field bits<32> Word1; + + bits<11> dst; + bits<3> bank_swizzle; + bits<1> dst_rel; + bits<1> clamp; + + bits<7> dst_sel = dst{6-0}; + bits<2> dst_chan = dst{10-9}; + + let Word1{20-18} = bank_swizzle; + let Word1{27-21} = dst_sel; + let Word1{28} = dst_rel; + let Word1{30-29} = dst_chan; + let Word1{31} = clamp; +} + +class R600ALU_Word1_OP2 alu_inst> : R600ALU_Word1{ + + bits<1> src0_abs; + bits<1> src1_abs; + bits<1> update_exec_mask; + bits<1> update_pred; + bits<1> write; + bits<2> omod; + + let Word1{0} = src0_abs; + let Word1{1} = src1_abs; + let Word1{2} = update_exec_mask; + let Word1{3} = update_pred; + let Word1{4} = write; + let Word1{6-5} = omod; + let Word1{17-7} = alu_inst; +} + +class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ + + bits<11> src2; + bits<1> src2_rel; + bits<1> src2_neg; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = src2_neg; + let Word1{17-13} = alu_inst; +} + +class R600LDS_Word1 { + field bits<32> Word1; + + bits<11> src2; + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + bits<1> src2_rel; + // offset specifies the stride offset to the second set of data to be read + // from. This is a dword offset. 
+ bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP + bits<3> bank_swizzle; + bits<6> lds_op; + bits<2> dst_chan = 0; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{17-13} = alu_inst; + let Word1{20-18} = bank_swizzle; + let Word1{26-21} = lds_op; + let Word1{30-29} = dst_chan; +} + + +/* +XXX: R600 subtarget uses a slightly different encoding than the other +subtargets. We currently handle this in R600MCCodeEmitter, but we may +want to use these instruction classes in the future. + +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { + + bits<1> fog_merge; + bits<10> alu_inst; + + let Inst{37} = fog_merge; + let Inst{39-38} = omod; + let Inst{49-40} = alu_inst; +} + +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { + + bits<11> alu_inst; + + let Inst{38-37} = omod; + let Inst{49-39} = alu_inst; +} +*/ + +//===----------------------------------------------------------------------===// +// Vertex Fetch instructions +//===----------------------------------------------------------------------===// + +class VTX_WORD0 { + field bits<32> Word0; + bits<7> src_gpr; + bits<5> VC_INST; + bits<2> FETCH_TYPE; + bits<1> FETCH_WHOLE_QUAD; + bits<8> BUFFER_ID; + bits<1> SRC_REL; + bits<2> SRC_SEL_X; + + let Word0{4-0} = VC_INST; + let Word0{6-5} = FETCH_TYPE; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = BUFFER_ID; + let Word0{22-16} = src_gpr; + let Word0{23} = SRC_REL; + let Word0{25-24} = SRC_SEL_X; +} + +class VTX_WORD0_eg : VTX_WORD0 { + + bits<6> MEGA_FETCH_COUNT; + + let Word0{31-26} = MEGA_FETCH_COUNT; +} + +class VTX_WORD0_cm : VTX_WORD0 { + + bits<2> SRC_SEL_Y; + bits<2> STRUCTURED_READ; + bits<1> LDS_REQ; + bits<1> COALESCED_READ; + + let Word0{27-26} = SRC_SEL_Y; + let Word0{29-28} = STRUCTURED_READ; + let Word0{30} = LDS_REQ; + let Word0{31} = COALESCED_READ; +} + +class VTX_WORD1_GPR { + field bits<32> Word1; + bits<7> dst_gpr; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<1> USE_CONST_FIELDS; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL; + bits<1> FORMAT_COMP_ALL; + bits<1> SRF_MODE_ALL; + + let Word1{6-0} = dst_gpr; + let Word1{7} = DST_REL; + let Word1{8} = 0; // Reserved + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{21} = USE_CONST_FIELDS; + let Word1{27-22} = DATA_FORMAT; + let Word1{29-28} = NUM_FORMAT_ALL; + let Word1{30} = FORMAT_COMP_ALL; + let Word1{31} = SRF_MODE_ALL; +} + +//===----------------------------------------------------------------------===// +// Texture fetch instructions +//===----------------------------------------------------------------------===// + +class TEX_WORD0 { + field bits<32> Word0; + + bits<5> TEX_INST; + bits<2> INST_MOD; + bits<1> FETCH_WHOLE_QUAD; + bits<8> RESOURCE_ID; + bits<7> SRC_GPR; + bits<1> SRC_REL; + bits<1> ALT_CONST; + bits<2> RESOURCE_INDEX_MODE; + bits<2> SAMPLER_INDEX_MODE; + + let Word0{4-0} = TEX_INST; + let Word0{6-5} = INST_MOD; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = RESOURCE_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{24} = ALT_CONST; + let Word0{26-25} = RESOURCE_INDEX_MODE; + let Word0{28-27} = SAMPLER_INDEX_MODE; +} + +class TEX_WORD1 { + field bits<32> Word1; + + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<7> LOD_BIAS; + bits<1> COORD_TYPE_X; + bits<1> COORD_TYPE_Y; + bits<1> 
COORD_TYPE_Z; + bits<1> COORD_TYPE_W; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{27-21} = LOD_BIAS; + let Word1{28} = COORD_TYPE_X; + let Word1{29} = COORD_TYPE_Y; + let Word1{30} = COORD_TYPE_Z; + let Word1{31} = COORD_TYPE_W; +} + +class TEX_WORD2 { + field bits<32> Word2; + + bits<5> OFFSET_X; + bits<5> OFFSET_Y; + bits<5> OFFSET_Z; + bits<5> SAMPLER_ID; + bits<3> SRC_SEL_X; + bits<3> SRC_SEL_Y; + bits<3> SRC_SEL_Z; + bits<3> SRC_SEL_W; + + let Word2{4-0} = OFFSET_X; + let Word2{9-5} = OFFSET_Y; + let Word2{14-10} = OFFSET_Z; + let Word2{19-15} = SAMPLER_ID; + let Word2{22-20} = SRC_SEL_X; + let Word2{25-23} = SRC_SEL_Y; + let Word2{28-26} = SRC_SEL_Z; + let Word2{31-29} = SRC_SEL_W; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + +class CF_WORD1_R600 { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<3> COUNT; + bits<6> CALL_COUNT; + bits<1> COUNT_3; + bits<1> END_OF_PROGRAM; + bits<1> VALID_PIXEL_MODE; + bits<7> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{12-10} = COUNT; + let Word1{18-13} = CALL_COUNT; + let Word1{19} = COUNT_3; + let Word1{21} = END_OF_PROGRAM; + let Word1{22} = VALID_PIXEL_MODE; + let Word1{29-23} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_WORD0_EG { + field bits<32> Word0; + + bits<24> ADDR; + bits<3> JUMPTABLE_SEL; + + let Word0{23-0} = ADDR; + let Word0{26-24} = JUMPTABLE_SEL; +} + +class CF_WORD1_EG { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<6> COUNT; + bits<1> VALID_PIXEL_MODE; + bits<1> END_OF_PROGRAM; + bits<8> CF_INST; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{15-10} = COUNT; + let Word1{20} = VALID_PIXEL_MODE; + let Word1{21} = END_OF_PROGRAM; + let Word1{29-22} = CF_INST; + let Word1{31} = BARRIER; +} + +class CF_ALU_WORD0 { + field bits<32> Word0; + + bits<22> ADDR; + bits<4> KCACHE_BANK0; + bits<4> KCACHE_BANK1; + bits<2> KCACHE_MODE0; + + let Word0{21-0} = ADDR; + let Word0{25-22} = KCACHE_BANK0; + let Word0{29-26} = KCACHE_BANK1; + let Word0{31-30} = KCACHE_MODE0; +} + +class CF_ALU_WORD1 { + field bits<32> Word1; + + bits<2> KCACHE_MODE1; + bits<8> KCACHE_ADDR0; + bits<8> KCACHE_ADDR1; + bits<7> COUNT; + bits<1> ALT_CONST; + bits<4> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{1-0} = KCACHE_MODE1; + let Word1{9-2} = KCACHE_ADDR0; + let Word1{17-10} = KCACHE_ADDR1; + let Word1{24-18} = COUNT; + let Word1{25} = ALT_CONST; + let Word1{29-26} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_ALLOC_EXPORT_WORD0_RAT { + field bits<32> Word0; + + bits<4> rat_id; + bits<6> rat_inst; + bits<2> rim; + bits<2> type; + bits<7> rw_gpr; + bits<1> rw_rel; + bits<7> index_gpr; + bits<2> elem_size; + + let Word0{3-0} = rat_id; + let Word0{9-4} = rat_inst; + let Word0{10} = 0; // Reserved + let Word0{12-11} = rim; + let Word0{14-13} = type; + let Word0{21-15} = rw_gpr; + let Word0{22} = rw_rel; + let Word0{29-23} = index_gpr; + let Word0{31-30} = elem_size; +} + +class CF_ALLOC_EXPORT_WORD1_BUF 
{ + field bits<32> Word1; + + bits<12> array_size; + bits<4> comp_mask; + bits<4> burst_count; + bits<1> vpm; + bits<1> eop; + bits<8> cf_inst; + bits<1> mark; + bits<1> barrier; + + let Word1{11-0} = array_size; + let Word1{15-12} = comp_mask; + let Word1{19-16} = burst_count; + let Word1{20} = vpm; + let Word1{21} = eop; + let Word1{29-22} = cf_inst; + let Word1{30} = mark; + let Word1{31} = barrier; +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp new file mode 100644 index 00000000000..5ef883cbcad --- /dev/null +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -0,0 +1,1435 @@ +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + +#include "R600InstrInfo.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#include "AMDGPUGenDFAPacketizer.inc" + +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { + return RI; +} + +bool R600InstrInfo::isTrig(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; +} + +bool R600InstrInfo::isVector(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; +} + +void +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned VectorComponents = 0; + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + VectorComponents = 4; + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + VectorComponents = 2; + } + + if (VectorComponents > 0) { + for (unsigned I = 0; I < VectorComponents; I++) { + unsigned SubRegIndex = RI.getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + RI.getSubReg(DestReg, SubRegIndex), + RI.getSubReg(SrcReg, SubRegIndex)) + .addReg(DestReg, + RegState::Define | RegState::Implicit); + } + } else { + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + DestReg, SrcReg); + NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + .setIsKill(KillSrc); + } +} + +/// \returns true if \p MBBI can be moved into a new basic. 
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), + E = MBBI->operands_end(); I != E; ++I) { + if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && + I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + return false; + } + return true; +} + +bool R600InstrInfo::isMov(unsigned Opcode) const { + + + switch(Opcode) { + default: return false; + case AMDGPU::MOV: + case AMDGPU::MOV_IMM_F32: + case AMDGPU::MOV_IMM_I32: + return true; + } +} + +// Some instructions act as place holders to emulate operations that the GPU +// hardware does automatically. This function can be used to check if +// an opcode falls into this category. +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { + switch (Opcode) { + default: return false; + case AMDGPU::RETURN: + return true; + } +} + +bool R600InstrInfo::isReductionOp(unsigned Opcode) const { + return false; +} + +bool R600InstrInfo::isCubeOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::CUBE_r600_pseudo: + case AMDGPU::CUBE_r600_real: + case AMDGPU::CUBE_eg_pseudo: + case AMDGPU::CUBE_eg_real: + return true; + } +} + +bool R600InstrInfo::isALUInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return (TargetFlags & R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::LDS_1A) | + (TargetFlags & R600_InstFlag::LDS_1A1D) | + (TargetFlags & R600_InstFlag::LDS_1A2D)); +} + +bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; +} + +bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; +} + +bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { + if (isALUInstr(MI->getOpcode())) + return true; + if (isVector(*MI) || isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } +} + +bool R600InstrInfo::isTransOnly(unsigned Opcode) const { + if (ST.hasCaymanISA()) + return false; + return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); +} + +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { + return isTransOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { + return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); +} + +bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { + return isVectorOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isExport(unsigned Opcode) const { + return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); +} + +bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { + return ST.hasVertexCache() && IS_VTX(get(Opcode)); +} + +bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const 
R600MachineFunctionInfo *MFI = MF->getInfo(); + return MFI->getShaderType() != ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode()); +} + +bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { + return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); +} + +bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const R600MachineFunctionInfo *MFI = MF->getInfo(); + return (MFI->getShaderType() == ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); +} + +bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::KILLGT: + case AMDGPU::GROUP_BARRIER: + return true; + default: + return false; + } +} + +bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { + if (!isALUInstr(MI->getOpcode())) { + return false; + } + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + if (!I->isReg() || !I->isUse() || + TargetRegisterInfo::isVirtualRegister(I->getReg())) + continue; + + if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + return true; + } + return false; +} + +int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { + static const unsigned OpTable[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + + assert (SrcNum < 3); + return getOperandIdx(Opcode, OpTable[SrcNum]); +} + +int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { + static const unsigned SrcSelTable[][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + }; + + for (const auto &Row : SrcSelTable) { + if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { + return getOperandIdx(Opcode, Row[1]); + } + } + return -1; +} + +SmallVector, 3> +R600InstrInfo::getSrcs(MachineInstr *MI) const { + SmallVector, 3> Result; + + if (MI->getOpcode() == AMDGPU::DOT_4) { + static const unsigned OpTable[8][2] = { + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + }; + + for (unsigned j = 0; j < 8; j++) { + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + OpTable[j][0])); + unsigned Reg = MO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), + 
OpTable[j][1])).getImm(); + Result.push_back(std::pair(&MO, Sel)); + continue; + } + + } + return Result; + } + + static const unsigned OpTable[3][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + }; + + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (SrcIdx < 0) + break; + MachineOperand &MO = MI->getOperand(SrcIdx); + unsigned Reg = MI->getOperand(SrcIdx).getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand( + getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); + Result.push_back(std::pair(&MO, Sel)); + continue; + } + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned Imm = MI->getOperand( + getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); + Result.push_back(std::pair(&MO, Imm)); + continue; + } + Result.push_back(std::pair(&MO, 0)); + } + return Result; +} + +std::vector > +R600InstrInfo::ExtractSrcs(MachineInstr *MI, + const DenseMap &PV, + unsigned &ConstCount) const { + ConstCount = 0; + ArrayRef> Srcs = getSrcs(MI); + const std::pair DummyPair(-1, 0); + std::vector > Result; + unsigned i = 0; + for (unsigned n = Srcs.size(); i < n; ++i) { + unsigned Reg = Srcs[i].first->getReg(); + unsigned Index = RI.getEncodingValue(Reg) & 0xff; + if (Reg == AMDGPU::OQAP) { + Result.push_back(std::pair(Index, 0)); + } + if (PV.find(Reg) != PV.end()) { + // 255 is used to tells its a PS/PV reg + Result.push_back(std::pair(255, 0)); + continue; + } + if (Index > 127) { + ConstCount++; + Result.push_back(DummyPair); + continue; + } + unsigned Chan = RI.getHWRegChan(Reg); + Result.push_back(std::pair(Index, Chan)); + } + for (; i < 3; ++i) + Result.push_back(DummyPair); + return Result; +} + +static std::vector > +Swizzle(std::vector > Src, + R600InstrInfo::BankSwizzle Swz) { + if (Src[0] == Src[1]) + Src[1].first = -1; + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: + break; + case R600InstrInfo::ALU_VEC_021_SCL_122: + std::swap(Src[1], Src[2]); + break; + case R600InstrInfo::ALU_VEC_102_SCL_221: + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_120_SCL_212: + std::swap(Src[0], Src[1]); + std::swap(Src[0], Src[2]); + break; + case R600InstrInfo::ALU_VEC_201: + std::swap(Src[0], Src[2]); + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_210: + std::swap(Src[0], Src[2]); + break; + } + return Src; +} + +static unsigned +getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: { + unsigned Cycles[3] = { 2, 1, 0}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_021_SCL_122: { + unsigned Cycles[3] = { 1, 2, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_120_SCL_212: { + unsigned Cycles[3] = { 2, 1, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_102_SCL_221: { + unsigned Cycles[3] = { 2, 2, 1}; + return Cycles[Op]; + } + default: + llvm_unreachable("Wrong Swizzle for Trans Slot"); + return 0; + } +} + +/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed +/// in the same Instruction Group while meeting read port limitations given a +/// Swz swizzle sequence. 
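+/// An illustrative reading of the check below: Vector[Chan][Cycle] records
+/// which GPR index a given channel's read port fetches in a given cycle. If
+/// two sources of the group would need two different GPR indices on the same
+/// (channel, cycle) slot, the group is only legal up to the conflicting
+/// instruction, whose index is returned.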
+unsigned R600InstrInfo::isLegalUpTo( + const std::vector > > &IGSrcs, + const std::vector &Swz, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + int Vector[4][3]; + memset(Vector, -1, sizeof(Vector)); + for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { + const std::vector > &Srcs = + Swizzle(IGSrcs[i], Swz[i]); + for (unsigned j = 0; j < 3; j++) { + const std::pair &Src = Srcs[j]; + if (Src.first < 0 || Src.first == 255) + continue; + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && + Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { + // The value from output queue A (denoted by register OQAP) can + // only be fetched during the first cycle. + return false; + } + // OQAP does not count towards the normal read port restrictions + continue; + } + if (Vector[Src.second][j] < 0) + Vector[Src.second][j] = Src.first; + if (Vector[Src.second][j] != Src.first) + return i; + } + } + // Now check Trans Alu + for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) { + const std::pair &Src = TransSrcs[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (Src.first == 255) + continue; + if (Vector[Src.second][Cycle] < 0) + Vector[Src.second][Cycle] = Src.first; + if (Vector[Src.second][Cycle] != Src.first) + return IGSrcs.size() - 1; + } + return IGSrcs.size(); +} + +/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next +/// (in lexicographic term) swizzle sequence assuming that all swizzles after +/// Idx can be skipped +static bool +NextPossibleSolution( + std::vector &SwzCandidate, + unsigned Idx) { + assert(Idx < SwzCandidate.size()); + int ResetIdx = Idx; + while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210) + ResetIdx --; + for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) { + SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210; + } + if (ResetIdx == -1) + return false; + int NextSwizzle = SwzCandidate[ResetIdx] + 1; + SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle; + return true; +} + +/// Enumerate all possible Swizzle sequence to find one that can meet all +/// read port requirements. +bool R600InstrInfo::FindSwizzleForVectorSlot( + const std::vector > > &IGSrcs, + std::vector &SwzCandidate, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + unsigned ValidUpTo = 0; + do { + ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz); + if (ValidUpTo == IGSrcs.size()) + return true; + } while (NextPossibleSolution(SwzCandidate, ValidUpTo)); + return false; +} + +/// Instructions in Trans slot can't read gpr at cycle 0 if they also read +/// a const, and can't read a gpr at cycle 1 if they read 2 const. 
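+/// For example, if a Trans-slot instruction reads two constants, its remaining
+/// GPR source may only be fetched in cycle 2, so only swizzles that map it to
+/// cycle 2 pass isConstCompatible below; with three constants no swizzle can
+/// work and the group is rejected outright.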
+static bool +isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, + const std::vector > &TransOps, + unsigned ConstCount) { + // TransALU can't read 3 constants + if (ConstCount > 2) + return false; + for (unsigned i = 0, e = TransOps.size(); i < e; ++i) { + const std::pair &Src = TransOps[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (ConstCount > 0 && Cycle == 0) + return false; + if (ConstCount > 1 && Cycle == 1) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsReadPortLimitations(const std::vector &IG, + const DenseMap &PV, + std::vector &ValidSwizzle, + bool isLastAluTrans) + const { + //Todo : support shared src0 - src1 operand + + std::vector > > IGSrcs; + ValidSwizzle.clear(); + unsigned ConstCount; + BankSwizzle TransBS = ALU_VEC_012_SCL_210; + for (unsigned i = 0, e = IG.size(); i < e; ++i) { + IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); + unsigned Op = getOperandIdx(IG[i]->getOpcode(), + AMDGPU::OpName::bank_swizzle); + ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) + IG[i]->getOperand(Op).getImm()); + } + std::vector > TransOps; + if (!isLastAluTrans) + return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); + + TransOps = std::move(IGSrcs.back()); + IGSrcs.pop_back(); + ValidSwizzle.pop_back(); + + static const R600InstrInfo::BankSwizzle TransSwz[] = { + ALU_VEC_012_SCL_210, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221 + }; + for (unsigned i = 0; i < 4; i++) { + TransBS = TransSwz[i]; + if (!isConstCompatible(TransBS, TransOps, ConstCount)) + continue; + bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, + TransBS); + if (Result) { + ValidSwizzle.push_back(TransBS); + return true; + } + } + + return false; +} + + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector &Consts) + const { + assert (Consts.size() <= 12 && "Too many operands in instructions group"); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + unsigned ReadConstHalf = Consts[i] & 2; + unsigned ReadConstIndex = Consts[i] & (~3); + unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; + if (!Pair1) { + Pair1 = ReadHalfConst; + continue; + } + if (Pair1 == ReadHalfConst) + continue; + if (!Pair2) { + Pair2 = ReadHalfConst; + continue; + } + if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector &MIs) + const { + std::vector Consts; + SmallSet Literals; + for (unsigned i = 0, n = MIs.size(); i < n; i++) { + MachineInstr *MI = MIs[i]; + if (!isALUInstr(MI->getOpcode())) + continue; + + ArrayRef> Srcs = getSrcs(MI); + + for (unsigned j = 0, e = Srcs.size(); j < e; j++) { + std::pair Src = Srcs[j]; + if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + Literals.insert(Src.second); + if (Literals.size() > 4) + return false; + if (Src.first->getReg() == AMDGPU::ALU_CONST) + Consts.push_back(Src.second); + if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || + AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; + unsigned Chan = RI.getHWRegChan(Src.first->getReg()); + Consts.push_back((Index << 2) | Chan); + } + } + } + return fitsConstReadLimitations(Consts); +} + +DFAPacketizer * +R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { + const InstrItineraryData *II = STI.getInstrItineraryData(); + return 
static_cast(STI).createDFAPacketizer(II); +} + +static bool +isPredicateSetter(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::PRED_X: + return true; + default: + return false; + } +} + +static MachineInstr * +findFirstPredicateSetterFrom(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + while (I != MBB.begin()) { + --I; + MachineInstr *MI = I; + if (isPredicateSetter(MI->getOpcode())) + return MI; + } + + return nullptr; +} + +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + +static bool isBranch(unsigned Opcode) { + return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || + Opcode == AMDGPU::BRANCH_COND_f32; +} + +bool +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { + // Most of the following comes from the ARM implementation of AnalyzeBranch + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + // AMDGPU::BRANCH* instructions are only available after isel and are not + // handled + if (isBranch(I->getOpcode())) + return true; + if (!isJump(static_cast(I)->getOpcode())) { + return false; + } + + // Remove successive JUMP + while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + MachineBasicBlock::iterator PriorI = std::prev(I); + if (AllowModify) + I->removeFromParent(); + I = PriorI; + } + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || + !isJump(static_cast(--I)->getOpcode())) { + if (LastOpc == AMDGPU::JUMP) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastOpc == AMDGPU::JUMP_COND) { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If the block ends with a B and a Bcc, handle it. + if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + MachineInstr *predSet = --I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = SecondLastInst->getOperand(0).getMBB(); + FBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + + // Otherwise, can't handle this. 
+ return true; +} + +static +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); + It != E; ++It) { + if (It->getOpcode() == AMDGPU::CF_ALU || + It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return std::prev(It.base()); + } + return MBB.end(); +} + +unsigned +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef Cond, + DebugLoc DL) const { + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) { + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + return 1; + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 1; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 1; + } + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 2; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 2; + } +} + +unsigned +R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + + // Note : we leave PRED* instructions there. + // They may be needed when predicating instructions. + + MachineBasicBlock::iterator I = MBB.end(); + + if (I == MBB.begin()) { + return 0; + } + --I; + switch (I->getOpcode()) { + default: + return 0; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + I = MBB.end(); + + if (I == MBB.begin()) { + return 1; + } + --I; + switch (I->getOpcode()) { + // FIXME: only one case?? 
+ default: + return 1; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + return 2; +} + +bool +R600InstrInfo::isPredicated(const MachineInstr *MI) const { + int idx = MI->findFirstPredOperandIdx(); + if (idx < 0) + return false; + + unsigned Reg = MI->getOperand(idx).getReg(); + switch (Reg) { + default: return false; + case AMDGPU::PRED_SEL_ONE: + case AMDGPU::PRED_SEL_ZERO: + case AMDGPU::PREDICATE_BIT: + return true; + } +} + +bool +R600InstrInfo::isPredicable(MachineInstr *MI) const { + // XXX: KILL* instructions can be predicated, but they must be the last + // instruction in a clause, so this means any instructions after them cannot + // be predicated. Until we have proper support for instruction clauses in the + // backend, we will mark KILL* instructions as unpredicable. + + if (MI->getOpcode() == AMDGPU::KILLGT) { + return false; + } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + // If the clause start in the middle of MBB then the MBB has more + // than a single clause, unable to predicate several clauses. + if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + return false; + // TODO: We don't support KC merging atm + if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) + return false; + return true; + } else if (isVector(*MI)) { + return false; + } else { + return AMDGPUInstrInfo::isPredicable(MI); + } +} + + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const{ + return true; +} + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + const BranchProbability &Probability) const { + return true; +} + +bool +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + const BranchProbability &Probability) + const { + return true; +} + +bool +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const { + return false; +} + + +bool +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) const { + MachineOperand &MO = Cond[1]; + switch (MO.getImm()) { + case OPCODE_IS_ZERO_INT: + MO.setImm(OPCODE_IS_NOT_ZERO_INT); + break; + case OPCODE_IS_NOT_ZERO_INT: + MO.setImm(OPCODE_IS_ZERO_INT); + break; + case OPCODE_IS_ZERO: + MO.setImm(OPCODE_IS_NOT_ZERO); + break; + case OPCODE_IS_NOT_ZERO: + MO.setImm(OPCODE_IS_ZERO); + break; + default: + return true; + } + + MachineOperand &MO2 = Cond[2]; + switch (MO2.getReg()) { + case AMDGPU::PRED_SEL_ZERO: + MO2.setReg(AMDGPU::PRED_SEL_ONE); + break; + case AMDGPU::PRED_SEL_ONE: + MO2.setReg(AMDGPU::PRED_SEL_ZERO); + break; + default: + return true; + } + return false; +} + +bool +R600InstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const { + return isPredicateSetter(MI->getOpcode()); +} + + +bool +R600InstrInfo::SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const { + return false; +} + + +bool +R600InstrInfo::PredicateInstruction(MachineInstr *MI, + ArrayRef Pred) const { + int PIdx = 
MI->findFirstPredOperandIdx(); + + if (MI->getOpcode() == AMDGPU::CF_ALU) { + MI->getOperand(8).setImm(0); + return true; + } + + if (MI->getOpcode() == AMDGPU::DOT_4) { + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) + .setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + return false; +} + +unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { + return 2; +} + +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (PredCost) + *PredCost = 2; + return 2; +} + +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + +void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + const AMDGPUFrameLowering *TFL = static_cast( + MF.getSubtarget().getFrameLowering()); + + unsigned StackWidth = TFL->getStackWidth(MF); + int End = getIndirectIndexEnd(MF); + + if (End == -1) + return; + + for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); + Reserved.set(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Reserved.set(Reg); + } + } +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::R600_TReg32_XRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid 
Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); + + return Mov; +} + +unsigned R600InstrInfo::getMaxAlusPerClause() const { + return 115; +} + +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg) const { + MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), + DstReg); // $dst + + if (Src1Reg) { + MIB.addImm(0) // $update_exec_mask + .addImm(0); // $update_predicate + } + MIB.addImm(1) // $write + .addImm(0) // $omod + .addImm(0) // $dst_rel + .addImm(0) // $dst_clamp + .addReg(Src0Reg) // $src0 + .addImm(0) // $src0_neg + .addImm(0) // $src0_rel + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel + + if (Src1Reg) { + MIB.addReg(Src1Reg) // $src1 + .addImm(0) // $src1_neg + .addImm(0) // $src1_rel + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel + } + + //XXX: The r600g finalizer expects this to be 1, once we've moved the + //scheduling to the backend, we can change the default to 0. 
+ MIB.addImm(1) // $last + .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addImm(0) // $literal + .addImm(0); // $bank_swizzle + + return MIB; +} + +#define OPERAND_CASE(Label) \ + case Label: { \ + static const unsigned Ops[] = \ + { \ + Label##_X, \ + Label##_Y, \ + Label##_Z, \ + Label##_W \ + }; \ + return Ops[Slot]; \ + } + +static unsigned getSlotedOps(unsigned Op, unsigned Slot) { + switch (Op) { + OPERAND_CASE(AMDGPU::OpName::update_exec_mask) + OPERAND_CASE(AMDGPU::OpName::update_pred) + OPERAND_CASE(AMDGPU::OpName::write) + OPERAND_CASE(AMDGPU::OpName::omod) + OPERAND_CASE(AMDGPU::OpName::dst_rel) + OPERAND_CASE(AMDGPU::OpName::clamp) + OPERAND_CASE(AMDGPU::OpName::src0) + OPERAND_CASE(AMDGPU::OpName::src0_neg) + OPERAND_CASE(AMDGPU::OpName::src0_rel) + OPERAND_CASE(AMDGPU::OpName::src0_abs) + OPERAND_CASE(AMDGPU::OpName::src0_sel) + OPERAND_CASE(AMDGPU::OpName::src1) + OPERAND_CASE(AMDGPU::OpName::src1_neg) + OPERAND_CASE(AMDGPU::OpName::src1_rel) + OPERAND_CASE(AMDGPU::OpName::src1_abs) + OPERAND_CASE(AMDGPU::OpName::src1_sel) + OPERAND_CASE(AMDGPU::OpName::pred_sel) + default: + llvm_unreachable("Wrong Operand"); + } +} + +#undef OPERAND_CASE + +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( + MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + const { + assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + unsigned Opcode; + if (ST.getGeneration() <= AMDGPUSubtarget::R700) + Opcode = AMDGPU::DOT4_r600; + else + Opcode = AMDGPU::DOT4_eg; + MachineBasicBlock::iterator I = MI; + MachineOperand &Src0 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + MachineOperand &Src1 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + MachineInstr *MIB = buildDefaultInstruction( + MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + static const unsigned Operands[14] = { + AMDGPU::OpName::update_exec_mask, + AMDGPU::OpName::update_pred, + AMDGPU::OpName::write, + AMDGPU::OpName::omod, + AMDGPU::OpName::dst_rel, + AMDGPU::OpName::clamp, + AMDGPU::OpName::src0_neg, + AMDGPU::OpName::src0_rel, + AMDGPU::OpName::src0_abs, + AMDGPU::OpName::src0_sel, + AMDGPU::OpName::src1_neg, + AMDGPU::OpName::src1_rel, + AMDGPU::OpName::src1_abs, + AMDGPU::OpName::src1_sel, + }; + + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + .setReg(MO.getReg()); + + for (unsigned i = 0; i < 14; i++) { + MachineOperand &MO = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + assert (MO.isImm()); + setImmOperand(MIB, Operands[i], MO.getImm()); + } + MIB->getOperand(20).setImm(0); + return MIB; +} + +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const { + MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, + AMDGPU::ALU_LITERAL_X); + setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); + return MovImm; +} + +MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const { + return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); +} + +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { + return getOperandIdx(MI.getOpcode(), Op); +} + +int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const { + return AMDGPU::getNamedOperandIdx(Opcode, Op); +} + +void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, + int64_t Imm) const { + int Idx = getOperandIdx(*MI, Op); + assert(Idx != -1 && "Operand not supported for this instruction."); + assert(MI->getOperand(Idx).isImm()); + MI->getOperand(Idx).setImm(Imm); +} + +//===----------------------------------------------------------------------===// +// Instruction flag getters/setters +//===----------------------------------------------------------------------===// + +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { + return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; +} + +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + int FlagIndex = 0; + if (Flag != 0) { + // If we pass something other than the default value of Flag to this + // function, it means we are want to set a flag on an instruction + // that uses native encoding. + assert(HAS_NATIVE_OPERANDS(TargetFlags)); + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; + switch (Flag) { + case MO_FLAG_CLAMP: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); + break; + case MO_FLAG_MASK: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); + break; + case MO_FLAG_NOT_LAST: + case MO_FLAG_LAST: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); + break; + case MO_FLAG_NEG: + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; + case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; + } + break; + + case MO_FLAG_ABS: + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " + "instructions."); + (void)IsOP3; + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; + } + break; + + default: + FlagIndex = -1; + break; + } + assert(FlagIndex != -1 && "Flag not supported for this instruction"); + } else { + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); + assert(FlagIndex != 0 && + "Instruction flags not supported for this instruction"); + } + + MachineOperand &FlagOp = MI->getOperand(FlagIndex); + assert(FlagOp.isImm()); + return FlagOp; +} + +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (Flag == 0) { + return; + } + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + if (Flag == MO_FLAG_NOT_LAST) { + clearFlag(MI, Operand, MO_FLAG_LAST); + } else if (Flag == MO_FLAG_MASK) { + clearFlag(MI, Operand, Flag); + } else { + FlagOp.setImm(1); + } + } else { + MachineOperand &FlagOp = getFlagOp(MI, Operand); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + } +} + +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + FlagOp.setImm(0); + } else { + MachineOperand &FlagOp = getFlagOp(MI); + unsigned InstFlags = FlagOp.getImm(); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + FlagOp.setImm(InstFlags); + } +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h new 
file mode 100644 index 00000000000..dee4c2b9ae3 --- /dev/null +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -0,0 +1,303 @@ +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600InstrInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "R600Defines.h" +#include "R600RegisterInfo.h" +#include + +namespace llvm { + + class AMDGPUTargetMachine; + class DFAPacketizer; + class ScheduleDAG; + class MachineFunction; + class MachineInstr; + class MachineInstrBuilder; + + class R600InstrInfo : public AMDGPUInstrInfo { + private: + const R600RegisterInfo RI; + + std::vector > + ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; + + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + public: + enum BankSwizzle { + ALU_VEC_012_SCL_210 = 0, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221, + ALU_VEC_201, + ALU_VEC_210 + }; + + explicit R600InstrInfo(const AMDGPUSubtarget &st); + + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; + + bool isTrig(const MachineInstr &MI) const; + bool isPlaceHolderOpcode(unsigned opcode) const; + bool isReductionOp(unsigned opcode) const; + bool isCubeOp(unsigned opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction. + bool isALUInstr(unsigned Opcode) const; + bool hasInstrModifiers(unsigned Opcode) const; + bool isLDSInstr(unsigned Opcode) const; + bool isLDSNoRetInstr(unsigned Opcode) const; + bool isLDSRetInstr(unsigned Opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction or an + /// instruction that will be lowered in ExpandSpecialInstrs Pass. + bool canBeConsideredALU(const MachineInstr *MI) const; + + bool isTransOnly(unsigned Opcode) const; + bool isTransOnly(const MachineInstr *MI) const; + bool isVectorOnly(unsigned Opcode) const; + bool isVectorOnly(const MachineInstr *MI) const; + bool isExport(unsigned Opcode) const; + + bool usesVertexCache(unsigned Opcode) const; + bool usesVertexCache(const MachineInstr *MI) const; + bool usesTextureCache(unsigned Opcode) const; + bool usesTextureCache(const MachineInstr *MI) const; + + bool mustBeLastInClause(unsigned Opcode) const; + bool usesAddressRegister(MachineInstr *MI) const; + bool definesAddressRegister(MachineInstr *MI) const; + bool readsLDSSrcReg(const MachineInstr *MI) const; + + /// \returns The operand index for the given source number. Legal values + /// for SrcNum are 0, 1, and 2. 
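+  /// A usage sketch (hypothetical variables): the returned operand index can
+  /// be fed to getSelIdx() to locate the matching *_sel operand, e.g.
+  ///   int SrcIdx = TII->getSrcIdx(Opc, 1);
+  ///   int SelIdx = TII->getSelIdx(Opc, SrcIdx);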
+ int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; + /// \returns The operand Index for the Sel operand given an index to one + /// of the instruction's src operands. + int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; + + /// \returns a pair for each src of an ALU instructions. + /// The first member of a pair is the register id. + /// If register is ALU_CONST, second member is SEL. + /// If register is ALU_LITERAL, second member is IMM. + /// Otherwise, second member value is undefined. + SmallVector, 3> + getSrcs(MachineInstr *MI) const; + + unsigned isLegalUpTo( + const std::vector > > &IGSrcs, + const std::vector &Swz, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + bool FindSwizzleForVectorSlot( + const std::vector > > &IGSrcs, + std::vector &SwzCandidate, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210 + /// returns true and the first (in lexical order) BankSwizzle affectation + /// starting from the one already provided in the Instruction Group MIs that + /// fits Read Port limitations in BS if available. Otherwise returns false + /// and undefined content in BS. + /// isLastAluTrans should be set if the last Alu of MIs will be executed on + /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to + /// apply to the last instruction. + /// PV holds GPR to PV registers in the Instruction Group MIs. + bool fitsReadPortLimitations(const std::vector &MIs, + const DenseMap &PV, + std::vector &BS, + bool isLastAluTrans) const; + + /// An instruction group can only access 2 channel pair (either [XY] or [ZW]) + /// from KCache bank on R700+. This function check if MI set in input meet + /// this limitations + bool fitsConstReadLimitations(const std::vector &) const; + /// Same but using const index set instead of MI set. + bool fitsConstReadLimitations(const std::vector&) const; + + /// \brief Vector instructions are instructions that must fill all + /// instruction slots within an instruction group. 
+ bool isVector(const MachineInstr &MI) const; + + bool isMov(unsigned Opcode) const override; + + DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const override; + + bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, bool AllowModify) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef Cond, + DebugLoc DL) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + bool isPredicated(const MachineInstr *MI) const override; + + bool isPredicable(MachineInstr *MI) const override; + + bool + isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + const BranchProbability &Probability) const override; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const override ; + + bool + isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + const BranchProbability &Probability) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const override; + + bool SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const override; + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override; + + bool PredicateInstruction(MachineInstr *MI, + ArrayRef Pred) const override; + + unsigned int getPredicationCost(const MachineInstr *) const override; + + unsigned int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + /// \brief Reserve the registers that may be accesed using indirect addressing. + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + unsigned getMaxAlusPerClause() const; + + ///buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. + /// You can use this function to avoid manually specifying each instruction + /// modifier operand when building a new instruction. + /// + /// \returns a MachineInstr with all the instruction modifiers initialized + /// to their default values. 
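+  /// A minimal sketch of the intended use (mirroring buildMovImm in
+  /// R600InstrInfo.cpp; register names are placeholders):
+  ///   MachineInstr *Mov = buildDefaultInstruction(MBB, I, AMDGPU::MOV,
+  ///                                               DstReg, AMDGPU::ALU_LITERAL_X);
+  ///   setImmOperand(Mov, AMDGPU::OpName::literal, Imm);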
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg = 0) const; + + MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned Slot, + unsigned DstReg) const; + + MachineInstr *buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + + /// \brief Get the index of Op in the MachineInstr. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(const MachineInstr &MI, unsigned Op) const; + + /// \brief Get the index of \p Op for the given Opcode. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(unsigned Opcode, unsigned Op) const; + + /// \brief Helper function for setting instruction flag values. + void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; + + /// \returns true if this instruction has an operand for storing target flags. + bool hasFlagOperand(const MachineInstr &MI) const; + + ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + + ///\brief Determine if the specified \p Flag is set on this \p Operand. + bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) + /// \param Flag The flag being set. + /// + /// \returns the operand containing the flags for this instruction. + MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + unsigned Flag = 0) const; + + /// \brief Clear the specified flag on the instruction. + void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; +}; + +namespace AMDGPU { + +int getLDSNoRetOp(uint16_t Opcode); + +} //End namespace AMDGPU + +} // End llvm namespace + +#endif diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td new file mode 100644 index 00000000000..7beed092b3f --- /dev/null +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -0,0 +1,1744 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available on R600 family +// GPUs. 
+// +//===----------------------------------------------------------------------===// + +include "R600Intrinsics.td" +include "R600InstrFormats.td" + +class InstR600ISA pattern> : + InstR600 { + + let Namespace = "AMDGPU"; +} + +def MEMxi : Operand { + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); + let PrintMethod = "printMemOperand"; +} + +def MEMrr : Operand { + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); +} + +// Operands for non-registers + +class InstFlag + : OperandWithDefaultOps { + let PrintMethod = PM; +} + +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers +def SEL : OperandWithDefaultOps { + let PrintMethod = "printSel"; +} +def BANK_SWIZZLE : OperandWithDefaultOps { + let PrintMethod = "printBankSwizzle"; +} + +def LITERAL : InstFlag<"printLiteral">; + +def WRITE : InstFlag <"printWrite", 1>; +def OMOD : InstFlag <"printOMOD">; +def REL : InstFlag <"printRel">; +def CLAMP : InstFlag <"printClamp">; +def NEG : InstFlag <"printNeg">; +def ABS : InstFlag <"printAbs">; +def UEM : InstFlag <"printUpdateExecMask">; +def UP : InstFlag <"printUpdatePred">; + +// XXX: The r600g finalizer in Mesa expects last to be one in most cases. +// Once we start using the packetizer in this backend we should have this +// default to 0. +def LAST : InstFlag<"printLast", 1>; +def RSel : Operand { + let PrintMethod = "printRSel"; +} +def CT: Operand { + let PrintMethod = "printCT"; +} + +def FRAMEri : Operand { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + +def ADDRParam : ComplexPattern; +def ADDRDWord : ComplexPattern; +def ADDRVTX_READ : ComplexPattern; +def ADDRGA_CONST_OFFSET : ComplexPattern; +def ADDRGA_VAR_OFFSET : ComplexPattern; + + +def R600_Pred : PredicateOperand; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +// Class for instructions with only one source register. +// If you add new ins to this instruction, make sure they are listed before +// $literal, because the backend currently assumes that the last operand is +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), +// and R600InstrInfo::getOperandIdx(). +class R600_1OP inst, string opName, list pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 { + + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let update_exec_mask = 0; + let update_pred = 0; + let HasNativeOperands = 1; + let Op1 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_1OP_Helper inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_1OP ; + +// If you add or change the operands for R600_2OP instructions, you must +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). 
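+// Concrete opcodes are normally declared through the helper classes defined
+// here, e.g. (illustrative only; encodings come from the ISA documents):
+//   def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;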
+class R600_2OP inst, string opName, list pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, + OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 { + + let HasNativeOperands = 1; + let Op2 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_2OP_Helper inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_2OP ; + +// If you add our change the operands for R600_3OP instructions, you must +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and +// R600InstrInfo::getOperandIdx(). +class R600_3OP inst, string opName, list pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " + "$src0_neg$src0$src0_rel, " + "$src1_neg$src1$src1_rel, " + "$src2_neg$src2$src2_rel, " + "$pred_sel" + "$bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP3{ + + let HasNativeOperands = 1; + let DisableEncoding = "$literal"; + let Op3 = 1; + let UseNamedOperandTable = 1; + let ALUInst = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_REDUCTION inst, dag ins, string asm, list pattern, + InstrItinClass itin = VecALU> : + InstR600 <(outs R600_Reg32:$dst), + ins, + asm, + pattern, + itin>; + + + +} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; + +def TEX_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 14; + }] +>; + +def TEX_ARRAY_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 15; + }] +>; + +class EG_CF_RAT cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, + dag outs, dag ins, string asm, list pattern> : + InstR600ISA , + CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { + + let rat_id = ratid; + let rat_inst = ratinst; + let rim = 0; + // XXX: Have a separate instruction for non-indexed writes. 
+ let type = 1; + let rw_rel = 0; + let elem_size = 0; + + let array_size = 0; + let comp_mask = mask; + let burst_count = 0; + let vpm = 0; + let cf_inst = cfinst; + let mark = 0; + let barrier = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; + +} + +class VTX_READ buffer_id, dag outs, list pattern> + : InstR600ISA , + VTX_WORD1_GPR { + + // Static fields + let DST_REL = 0; + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, + // however, based on my testing if USE_CONST_FIELDS is set, then all + // these fields need to be set to 0. + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 1; + let FORMAT_COMP_ALL = 0; + let SRF_MODE_ALL = 0; + + let Inst{63-32} = Word1; + // LLVM can only encode 64-bit instructions, so these fields are manually + // encoded in R600CodeEmitter + // + // bits<16> OFFSET; + // bits<2> ENDIAN_SWAP = 0; + // bits<1> CONST_BUF_NO_STRIDE = 0; + // bits<1> MEGA_FETCH = 0; + // bits<1> ALT_CONST = 0; + // bits<2> BUFFER_INDEX_MODE = 0; + + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding + // is done in R600CodeEmitter + // + // Inst{79-64} = OFFSET; + // Inst{81-80} = ENDIAN_SWAP; + // Inst{82} = CONST_BUF_NO_STRIDE; + // Inst{83} = MEGA_FETCH; + // Inst{84} = ALT_CONST; + // Inst{86-85} = BUFFER_INDEX_MODE; + // Inst{95-86} = 0; Reserved + + // VTX_WORD3 (Padding) + // + // Inst{127-96} = 0; + + let VTXInst = 1; +} + +class LoadParamFrag : PatFrag < + (ops node:$ptr), (load_type node:$ptr), + [{ return isConstantLoad(dyn_cast(N), 0); }] +>; + +def load_param : LoadParamFrag; +def load_param_exti8 : LoadParamFrag; +def load_param_exti16 : LoadParamFrag; + +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; + +def isR600toCayman + : Predicate< + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; + +//===----------------------------------------------------------------------===// +// R600 SDNodes +//===----------------------------------------------------------------------===// + +def INTERP_PAIR_XY : AMDGPUShaderInst < + (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", + []>; + +def INTERP_PAIR_ZW : AMDGPUShaderInst < + (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", + []>; + +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", + SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPVariadic] +>; + +def DOT4 : SDNode<"AMDGPUISD::DOT4", + SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, + SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, + SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, + [] +>; + +def COS_HW : SDNode<"AMDGPUISD::COS_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; + +def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; + +multiclass TexPattern TextureOp, Instruction inst, ValueType vt = v4f32> { +def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, + (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), + (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), + (i32 
imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), + (i32 imm:$DST_SEL_W), + (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), + (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), + (i32 imm:$COORD_TYPE_W)), + (inst R600_Reg128:$SRC_GPR, + imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, + imm:$offsetx, imm:$offsety, imm:$offsetz, + imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, + imm:$DST_SEL_W, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, + imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, + imm:$COORD_TYPE_W)>; +} + +//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// + +def INTERP_VEC_LOAD : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins i32imm:$src0), + "INTERP_LOAD $src0 : $dst", + [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; + +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { + let bank_swizzle = 5; +} + +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { + let bank_swizzle = 5; +} + +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; + +//===----------------------------------------------------------------------===// +// Export Instructions +//===----------------------------------------------------------------------===// + +def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, + [SDNPHasChain, SDNPSideEffect]>; + +class ExportWord0 { + field bits<32> Word0; + + bits<13> arraybase; + bits<2> type; + bits<7> gpr; + bits<2> elem_size; + + let Word0{12-0} = arraybase; + let Word0{14-13} = type; + let Word0{21-15} = gpr; + let Word0{22} = 0; // RW_REL + let Word0{29-23} = 0; // INDEX_GPR + let Word0{31-30} = elem_size; +} + +class ExportSwzWord1 { + field bits<32> Word1; + + bits<3> sw_x; + bits<3> sw_y; + bits<3> sw_z; + bits<3> sw_w; + bits<1> eop; + bits<8> inst; + + let Word1{2-0} = sw_x; + let Word1{5-3} = sw_y; + let Word1{8-6} = sw_z; + let Word1{11-9} = sw_w; +} + +class ExportBufWord1 { + field bits<32> Word1; + + bits<12> arraySize; + bits<4> compMask; + bits<1> eop; + bits<8> inst; + + let Word1{11-0} = arraySize; + let Word1{15-12} = compMask; +} + +multiclass ExportPattern cf_inst> { + def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 0, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 7, 0, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy (i32 imm:$type)), + (ExportInst + (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy 1), + (ExportInst + (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$base, + imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) + >; + +} + +multiclass SteamOutputExportPattern buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { +// Stream0 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf0inst, 0)>; +// Stream1 + def : Pat<(int_R600_store_stream_output (v4f32 
R600_Reg128:$src), + (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf1inst, 0)>; +// Stream2 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf2inst, 0)>; +// Stream3 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf3inst, 0)>; +} + +// Export Instructions should not be duplicated by TailDuplication pass +// (which assumes that duplicable instruction are affected by exec mask) +let usesCustomInserter = 1, isNotDuplicable = 1 in { + +class ExportSwzInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst, + i32imm:$eop), + !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"), + []>, ExportWord0, ExportSwzWord1 { + let elem_size = 3; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +} // End usesCustomInserter = 1 + +class ExportBufInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportBufWord1 { + let elem_size = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + + +def KCACHE : InstFlag<"printKCache">; + +class ALU_CLAUSE inst, string OpName> : AMDGPUInst <(outs), +(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, +KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, +i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, +i32imm:$COUNT, i32imm:$Enabled), +!strconcat(OpName, " $COUNT, @$ADDR, " +"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), +[] >, CF_ALU_WORD0, CF_ALU_WORD1 { + field bits<64> Inst; + + let CF_INST = inst; + let ALT_CONST = 0; + let WHOLE_QUAD_MODE = 0; + let BARRIER = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_WORD0_R600 { + field bits<32> Word0; + + bits<32> ADDR; + + let Word0 = ADDR; +} + +class CF_CLAUSE_R600 inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { + field bits<64> Inst; + bits<4> CNT; + + let CF_INST = inst; + let BARRIER = 1; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let COUNT = CNT{2-0}; + let CALL_COUNT = 0; + let COUNT_3 = CNT{3}; + let END_OF_PROGRAM = 0; + let WHOLE_QUAD_MODE = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_CLAUSE_EG inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { + field bits<64> Inst; + + let CF_INST = inst; + let BARRIER = 1; + let JUMPTABLE_SEL = 0; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let END_OF_PROGRAM = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +def CF_ALU : ALU_CLAUSE<8, "ALU">; +def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; +def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; +def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; +def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; +def CF_ALU_ELSE_AFTER : 
ALU_CLAUSE<15, "ALU_ELSE_AFTER">; + +def FETCH_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def ALU_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def LITERALS : AMDGPUInst <(outs), +(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + let isCodeGenOnly = 1; + + field bits<64> Inst; + bits<32> literal1; + bits<32> literal2; + + let Inst{31-0} = literal1; + let Inst{63-32} = literal2; +} + +def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { + field bits<64> Inst; +} + +let Predicates = [isR600toCayman] in { + +//===----------------------------------------------------------------------===// +// Common Instructions R600, R700, Evergreen, Cayman +//===----------------------------------------------------------------------===// + +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; +// Non-IEEE MUL: 0 * anything = 0 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; +// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx +// DX10 min/max returns the other operand if one is NaN, +// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic +def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; +def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; + +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, +// so some of the instruction names don't match the asm string. +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
+def SETE : R600_2OP < + 0x08, "SETE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] +>; + +def SGT : R600_2OP < + 0x09, "SETGT", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] +>; + +def SGE : R600_2OP < + 0xA, "SETGE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] +>; + +def SNE : R600_2OP < + 0xB, "SETNE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] +>; + +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] +>; + +// FIXME: This should probably be COND_ONE +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] +>; + +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; + +def MOV : R600_1OP <0x19, "MOV", []>; + +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { + +class MOV_IMM : AMDGPUInst < + (outs R600_Reg32:$dst), + (ins immType:$imm), + "", + [] +>; + +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 + +def MOV_IMM_I32 : MOV_IMM; +def : Pat < + (imm:$val), + (MOV_IMM_I32 imm:$val) +>; + +def MOV_IMM_F32 : MOV_IMM; +def : Pat < + (fpimm:$val), + (MOV_IMM_F32 fpimm:$val) +>; + +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; +def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>; + +let hasSideEffects = 1 in { + +def KILLGT : R600_2OP <0x2D, "KILLGT", []>; + +} // end hasSideEffects + +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; + +def SETE_INT : R600_2OP < + 0x3A, "SETE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] +>; + +def SETGT_INT : R600_2OP < + 0x3B, "SETGT_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] +>; + +def SETGE_INT : R600_2OP < + 0x3C, "SETGE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] +>; + +def SETNE_INT : R600_2OP < + 0x3D, "SETNE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] +>; + +def SETGT_UINT : R600_2OP < + 0x3E, "SETGT_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] +>; + +def SETGE_UINT : R600_2OP < + 0x3F, "SETGE_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] +>; + +def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>; +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; 
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; + +def CNDE_INT : R600_3OP < + 0x1C, "CNDE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] +>; + +def CNDGE_INT : R600_3OP < + 0x1E, "CNDGE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))] +>; + +def CNDGT_INT : R600_3OP < + 0x1D, "CNDGT_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))] +>; + +//===----------------------------------------------------------------------===// +// Texture instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +class R600_TEX inst, string opName> : + InstR600 <(outs R600_Reg128:$DST_GPR), + (ins R600_Reg128:$SRC_GPR, + RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw, + i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz, + RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W, + i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, + CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, + CT:$COORD_TYPE_W), + !strconcat(opName, + " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " + "$SRC_GPR.$srcx$srcy$srcz$srcw " + "RID:$RESOURCE_ID SID:$SAMPLER_ID " + "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"), + [], + NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let TEX_INST = inst{4-0}; + let SRC_REL = 0; + let DST_REL = 0; + let LOD_BIAS = 0; + + let INST_MOD = 0; + let FETCH_WHOLE_QUAD = 0; + let ALT_CONST = 0; + let SAMPLER_INDEX_MODE = 0; + let RESOURCE_INDEX_MODE = 0; + + let TEXInst = 1; +} + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + + + +def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">; +def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">; +def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">; +def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">; +def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">; +def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">; +def TEX_LD : R600_TEX <0x03, "TEX_LD">; +def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> { + let INST_MOD = 1; +} +def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">; +def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">; +def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">; +def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">; +def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">; +def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">; +def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">; + +defm : TexPattern<0, TEX_SAMPLE>; +defm : TexPattern<1, TEX_SAMPLE_C>; +defm : TexPattern<2, TEX_SAMPLE_L>; +defm : TexPattern<3, TEX_SAMPLE_C_L>; +defm : TexPattern<4, TEX_SAMPLE_LB>; +defm : TexPattern<5, TEX_SAMPLE_C_LB>; +defm : TexPattern<6, TEX_LD, v4i32>; +defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>; +defm : TexPattern<8, TEX_GET_GRADIENTS_H>; +defm : TexPattern<9, TEX_GET_GRADIENTS_V>; +defm : TexPattern<10, TEX_LDPTR, v4i32>; + +//===----------------------------------------------------------------------===// +// Helper classes for common instructions +//===----------------------------------------------------------------------===// + +class MUL_LIT_Common inst> : R600_3OP < + inst, "MUL_LIT", + [] +>; + +class MULADD_Common inst> : R600_3OP < + inst, "MULADD", + [] +>; + +class MULADD_IEEE_Common inst> : R600_3OP < + inst, "MULADD_IEEE", + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] +>; 
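+// Editorial aside (not part of the original patch): the *_Common helper
+// classes above and below only fix the mnemonic and the selection pattern;
+// the generation-specific opcode is supplied where they are instantiated,
+// e.g. the R600/R700 block later in this file has
+//   def MULADD_r600 : MULADD_Common<0x10>;
+// and other generations presumably instantiate the same helpers with their
+// own encodings, so each pattern is written only once.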
+ +class FMA_Common inst> : R600_3OP < + inst, "FMA", + [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU +>; + +class CNDE_Common inst> : R600_3OP < + inst, "CNDE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] +>; + +class CNDGT_Common inst> : R600_3OP < + inst, "CNDGT", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] +> { + let Itinerary = VecALU; +} + +class CNDGE_Common inst> : R600_3OP < + inst, "CNDGE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] +> { + let Itinerary = VecALU; +} + + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins +// Slot X + UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, + OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, + R600_Pred:$pred_sel_X, +// Slot Y + UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, + OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, + OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, + OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU> { + + let UseNamedOperandTable = 1; + +} +} + +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 + R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, + R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, + R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, + R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; + + +class DOT4_Common inst> : R600_2OP ; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { +multiclass CUBE_Common inst> { + + def _pseudo : InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0), + "CUBE $dst $src0", + [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], + VecALU + > { + let isPseudo = 1; + let UseNamedOperandTable = 1; + } + + def _real : R600_2OP ; +} +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +class EXP_IEEE_Common inst> : R600_1OP_Helper < + inst, "EXP_IEEE", fexp2 +> { + let Itinerary = TransALU; +} + +class FLT_TO_INT_Common inst> : R600_1OP_Helper < + inst, "FLT_TO_INT", fp_to_sint +> { + let Itinerary = TransALU; +} + +class INT_TO_FLT_Common inst> : R600_1OP_Helper < + inst, "INT_TO_FLT", sint_to_fp +> { + let Itinerary = TransALU; +} + +class FLT_TO_UINT_Common inst> : R600_1OP_Helper < + inst, "FLT_TO_UINT", fp_to_uint +> { + let Itinerary = TransALU; +} + +class UINT_TO_FLT_Common inst> : R600_1OP_Helper < + inst, "UINT_TO_FLT", uint_to_fp +> { + let Itinerary = TransALU; +} + +class LOG_CLAMPED_Common inst> : R600_1OP < + inst, 
"LOG_CLAMPED", [] +>; + +class LOG_IEEE_Common inst> : R600_1OP_Helper < + inst, "LOG_IEEE", flog2 +> { + let Itinerary = TransALU; +} + +class LSHL_Common inst> : R600_2OP_Helper ; +class LSHR_Common inst> : R600_2OP_Helper ; +class ASHR_Common inst> : R600_2OP_Helper ; +class MULHI_INT_Common inst> : R600_2OP_Helper < + inst, "MULHI_INT", mulhs +> { + let Itinerary = TransALU; +} +class MULHI_UINT_Common inst> : R600_2OP_Helper < + inst, "MULHI", mulhu +> { + let Itinerary = TransALU; +} +class MULLO_INT_Common inst> : R600_2OP_Helper < + inst, "MULLO_INT", mul +> { + let Itinerary = TransALU; +} +class MULLO_UINT_Common inst> : R600_2OP { + let Itinerary = TransALU; +} + +class RECIP_CLAMPED_Common inst> : R600_1OP < + inst, "RECIP_CLAMPED", [] +> { + let Itinerary = TransALU; +} + +class RECIP_IEEE_Common inst> : R600_1OP < + inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] +> { + let Itinerary = TransALU; +} + +class RECIP_UINT_Common inst> : R600_1OP_Helper < + inst, "RECIP_UINT", AMDGPUurecip +> { + let Itinerary = TransALU; +} + +// Clamped to maximum. +class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped +> { + let Itinerary = TransALU; +} + +class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy +> { + let Itinerary = TransALU; +} + +// TODO: There is also RECIPSQRT_FF which clamps to zero. + +class SIN_Common inst> : R600_1OP < + inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ + let Trig = 1; + let Itinerary = TransALU; +} + +class COS_Common inst> : R600_1OP < + inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> { + let Trig = 1; + let Itinerary = TransALU; +} + +def CLAMP_R600 : CLAMP ; +def FABS_R600 : FABS; +def FNEG_R600 : FNEG; + +//===----------------------------------------------------------------------===// +// Helper patterns for complex intrinsics +//===----------------------------------------------------------------------===// + +// FIXME: Should be predicated on unsafe fp math. 
+multiclass DIV_Common { +def : Pat< + (int_AMDGPU_div f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : Pat< + (fdiv f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : RcpPat; +} + +class TGSI_LIT_Z_Common + : Pat < + (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) +>; + +//===----------------------------------------------------------------------===// +// R600 / R700 Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR600] in { + + def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; + def MULADD_r600 : MULADD_Common<0x10>; + def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; + def CNDE_r600 : CNDE_Common<0x18>; + def CNDGT_r600 : CNDGT_Common<0x19>; + def CNDGE_r600 : CNDGE_Common<0x1A>; + def DOT4_r600 : DOT4_Common<0x50>; + defm CUBE_r600 : CUBE_Common<0x52>; + def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; + def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; + def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; + def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; + def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; + def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; + def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; + def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; + def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; + def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; + def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; + def SIN_r600 : SIN_Common<0x6E>; + def COS_r600 : COS_Common<0x6F>; + def ASHR_r600 : ASHR_Common<0x70>; + def LSHR_r600 : LSHR_Common<0x71>; + def LSHL_r600 : LSHL_Common<0x72>; + def MULLO_INT_r600 : MULLO_INT_Common<0x73>; + def MULHI_INT_r600 : MULHI_INT_Common<0x74>; + def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; + def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; + def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; + + defm DIV_r600 : DIV_Common; + def : POW_Common ; + def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; + + def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; + def : RsqPat; + + def R600_ExportSwz : ExportSwzInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern; + + def R600_ExportBuf : ExportBufInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern; + + def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT), + "TEX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT), + "VTX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), + 
"PUSH_ELSE @$ADDR"> { + let CNT = 0; + let POP_COUNT = 0; // FIXME? + } + def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { + let ADDR = 0; + let CNT = 0; + let POP_COUNT = 0; + } + def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { + let CNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + +} + + +//===----------------------------------------------------------------------===// +// Regist loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +defm R600_ : RegisterLoadStore ; + + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +def PRED_X : InstR600 < + (outs R600_Predicate_Bit:$dst), + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; +} + +let isTerminator = 1, isBranch = 1 in { +def JUMP_COND : InstR600 < + (outs), + (ins brtarget:$target, R600_Predicate_Bit:$p), + "JUMP $target ($p)", + [], AnyALU + >; + +def JUMP : InstR600 < + (outs), + (ins brtarget:$target), + "JUMP $target", + [], AnyALU + > +{ + let isPredicable = 1; + let isBarrier = 1; +} + +} // End isTerminator = 1, isBranch = 1 + +let usesCustomInserter = 1 in { + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { + +def MASK_WRITE : AMDGPUShaderInst < + (outs), + (ins R600_Reg32:$src), + "MASK_WRITE $src", + [] +>; + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 + + +def TXD: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + NullALU > { + let TEXInst = 1; +} + +def TXD_SHADOW: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], + NullALU +> { + let TEXInst = 1; +} +} // End isPseudo = 1 +} // End usesCustomInserter = 1 + + +//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let hasSideEffects = 0; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), 
(ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + VTX_WORD1_GPR, VTX_WORD0_eg { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +def TEX_VTX_TEXBUF: + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", + [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, +VTX_WORD1_GPR, VTX_WORD0_eg { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X = 0; +let DST_SEL_Y = 1; +let DST_SEL_Z = 2; +let DST_SEL_W = 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +multiclass BranchConditional { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 
rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2 { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} + +//===---------------------------------------------------------------------===// +// Custom Inserter for Branches and returns, this eventually will be a +// separate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { + def BRANCH : ILFormat<(outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional; +} + +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + +let isTerminator=1 in { + def BREAK : ILFormat< (outs), (ins), + "BREAK", []>; + def CONTINUE : ILFormat< (outs), (ins), + "CONTINUE", []>; + def DEFAULT : ILFormat< (outs), (ins), + "DEFAULT", []>; + def ELSE : ILFormat< (outs), (ins), + "ELSE", []>; + def ENDSWITCH : ILFormat< (outs), (ins), + "ENDSWITCH", []>; + def ENDMAIN : ILFormat< (outs), (ins), + "ENDMAIN", []>; + def END : ILFormat< (outs), (ins), + "END", []>; + def ENDFUNC : ILFormat< (outs), (ins), + "ENDFUNC", []>; + def ENDIF : ILFormat< (outs), (ins), + "ENDIF", []>; + def WHILELOOP : ILFormat< (outs), (ins), + "WHILE", []>; + def ENDLOOP : ILFormat< (outs), (ins), + "ENDLOOP", []>; + def FUNC : ILFormat< (outs), (ins), + "FUNC", []>; + def RETDYN : ILFormat< (outs), (ins), + "RET_DYN", []>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; + defm IFC : BranchInstr2<"IFC">; + defm BREAKC : BranchInstr2<"BREAKC">; + defm CONTINUEC : BranchInstr2<"CONTINUEC">; +} + +//===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions 
+//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical ; +def R600_EXTRACT_ELT_V4 : ExtractVertical ; + +def R600_INSERT_ELT_V2 : InsertVertical ; +def R600_INSERT_ELT_V4 : InsertVertical ; + +class ExtractVerticalPat : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; + +class InsertVerticalPat : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat ; +def : InsertVerticalPat ; +def : InsertVerticalPat ; +def : InsertVerticalPat ; + +//===----------------------------------------------------------------------===// +// ISel Patterns +//===----------------------------------------------------------------------===// + +// CND*_INT Pattterns for f32 True / False values + +class CND_INT_f32 : Pat < + (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), + (cnd $src0, $src1, $src2) +>; + +def : CND_INT_f32 ; +def : CND_INT_f32 ; +def : CND_INT_f32 ; + +//CNDGE_INT extra pattern +def : Pat < + (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), + (CNDGE_INT $src0, $src1, $src2) +>; + +// KIL Patterns +def KILP : Pat < + (int_AMDGPU_kilp), + (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) +>; + +def KIL : Pat < + (int_AMDGPU_kill f32:$src0), + (MASK_WRITE (KILLGT (f32 ZERO), $src0)) +>; + +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; + +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; + +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; + +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; + +// bitconvert patterns + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// DWORDADDR pattern +def : DwordAddrPat ; + +} // End isR600toCayman Predicate + +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps; +defm : Expand24UBitOps; +} // End isR600 + +def getLDSNoRetOp : InstrMapping { + let FilterClass = "R600_LDS_1A1D"; + let RowFields = ["BaseOp"]; + let ColFields = ["DisableEncoding"]; + let KeyCol = ["$dst"]; + let ValueCols = [[""""]]; +} diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td new file mode 100644 index 00000000000..9681747006d --- /dev/null +++ b/lib/Target/AMDGPU/R600Intrinsics.td @@ -0,0 +1,75 @@ +//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// R600 Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + class TextureIntrinsicFloatInput : + Intrinsic<[llvm_v4f32_ty], [ + llvm_v4f32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + class TextureIntrinsicInt32Input : + Intrinsic<[llvm_v4i32_ty], [ + llvm_v4i32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + + def int_R600_load_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_const : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_R600_interp_xy : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; +def int_R600_interp_zw : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_R600_load_texbuf : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_tex : TextureIntrinsicFloatInput; + def int_R600_texc : TextureIntrinsicFloatInput; + def int_R600_txl : TextureIntrinsicFloatInput; + def int_R600_txlc : TextureIntrinsicFloatInput; + def int_R600_txb : TextureIntrinsicFloatInput; + def int_R600_txbc : TextureIntrinsicFloatInput; + def int_R600_txf : TextureIntrinsicInt32Input; + def int_R600_ldptr : TextureIntrinsicInt32Input; + def int_R600_txq : TextureIntrinsicInt32Input; + def int_R600_ddx : TextureIntrinsicFloatInput; + def int_R600_ddy : TextureIntrinsicFloatInput; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_stream_output : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_depth : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_stencil : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp new file mode 100644 index 00000000000..01105c614c5 --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -0,0 +1,20 @@ +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + + +// Pin the vtable to this file. 
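+// (Editorial note, not part of the original patch: the empty out-of-line
+// anchor() below is the usual LLVM idiom for giving a class with virtual
+// methods a single "home" object file, so its vtable is emitted once here
+// rather than as a weak copy in every translation unit that includes the
+// header.)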
+void R600MachineFunctionInfo::anchor() {} + +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF) { } diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h new file mode 100644 index 00000000000..263561edd30 --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -0,0 +1,34 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include + +namespace llvm { + +class R600MachineFunctionInfo : public AMDGPUMachineFunction { + void anchor() override; +public: + R600MachineFunctionInfo(const MachineFunction &MF); + SmallVector LiveOuts; + std::vector IndirectRegs; + unsigned StackSize; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp new file mode 100644 index 00000000000..bcde5fb50da --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -0,0 +1,469 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); + DAG = static_cast(dag); + const AMDGPUSubtarget &ST = DAG->MF.getSubtarget(); + TII = static_cast(DAG->TII); + TRI = static_cast(DAG->TRI); + VLIW5 = !ST.hasCaymanISA(); + MRI = &DAG->MRI; + CurInstKind = IDOther; + CurEmitted = 0; + OccupedSlotsMask = 31; + InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); + InstKindLimit[IDOther] = 32; + InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); + AluInstCount = 0; + FetchInstCount = 0; +} + +void R600SchedStrategy::MoveUnits(std::vector &QSrc, + std::vector &QDst) +{ + QDst.insert(QDst.end(), QSrc.begin(), QSrc.end()); + QSrc.clear(); +} + +static +unsigned getWFCountLimitedByGPR(unsigned GPRCount) { + assert (GPRCount && "GPRCount cannot be 0"); + return 248 / GPRCount; +} + +SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { + SUnit *SU = nullptr; + NextInstKind = IDOther; + + IsTopNode = false; + + // check if we might want to switch current clause type + bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || + (Available[CurInstKind].empty()); + bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && + (!Available[IDFetch].empty() || !Available[IDOther].empty()); + + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { + // We use the heuristic provided by AMD Accelerated Parallel Processing + // OpenCL Programming Guide : + // The approx. number of WF that allows TEX inst to hide ALU inst is : + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) + float ALUFetchRationEstimate = + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / + (FetchInstCount + Available[IDFetch].size()); + if (ALUFetchRationEstimate == 0) { + AllowSwitchFromAlu = true; + } else { + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. 
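+      // Illustrative arithmetic (editorial, not in the original patch):
+      // with 10 SUnits in Available[IDFetch], the fetch clause is assumed
+      // to need 2 * 10 = 20 GPRs, and getWFCountLimitedByGPR(20) = 248 / 20
+      // = 12 wavefronts; if NeededWF exceeds that, we allow switching away
+      // from ALU so the fetch clause can drain and relieve 128-bit register
+      // pressure.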
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } + } + + if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || + (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { + // try to pick ALU + SU = pickAlu(); + if (!SU && !PhysicalRegCopy.empty()) { + SU = PhysicalRegCopy.front(); + PhysicalRegCopy.erase(PhysicalRegCopy.begin()); + } + if (SU) { + if (CurEmitted >= InstKindLimit[IDAlu]) + CurEmitted = 0; + NextInstKind = IDAlu; + } + } + + if (!SU) { + // try to pick FETCH + SU = pickOther(IDFetch); + if (SU) + NextInstKind = IDFetch; + } + + // try to pick other + if (!SU) { + SU = pickOther(IDOther); + if (SU) + NextInstKind = IDOther; + } + + DEBUG( + if (SU) { + dbgs() << " ** Pick node **\n"; + SU->dump(DAG); + } else { + dbgs() << "NO NODE \n"; + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + } + ); + + return SU; +} + +void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (NextInstKind != CurInstKind) { + DEBUG(dbgs() << "Instruction Type Switch\n"); + if (NextInstKind != IDAlu) + OccupedSlotsMask |= 31; + CurEmitted = 0; + CurInstKind = NextInstKind; + } + + if (CurInstKind == IDAlu) { + AluInstCount ++; + switch (getAluKind(SU)) { + case AluT_XYZW: + CurEmitted += 4; + break; + case AluDiscarded: + break; + default: { + ++CurEmitted; + for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), + E = SU->getInstr()->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++CurEmitted; + } + } + } + } else { + ++CurEmitted; + } + + + DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + + if (CurInstKind != IDFetch) { + MoveUnits(Pending[IDFetch], Available[IDFetch]); + } else + FetchInstCount++; +} + +static bool +isPhysicalRegCopy(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::COPY) + return false; + + return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); +} + +void R600SchedStrategy::releaseTopNode(SUnit *SU) { + DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); +} + +void R600SchedStrategy::releaseBottomNode(SUnit *SU) { + DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); + if (isPhysicalRegCopy(SU->getInstr())) { + PhysicalRegCopy.push_back(SU); + return; + } + + int IK = getInstKind(SU); + + // There is no export clause, we can schedule one as soon as its ready + if (IK == IDOther) + Available[IDOther].push_back(SU); + else + Pending[IK].push_back(SU); + +} + +bool R600SchedStrategy::regBelongsToClass(unsigned Reg, + const TargetRegisterClass *RC) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->contains(Reg); + } else { + return MRI->getRegClass(Reg) == RC; + } +} + +R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { + MachineInstr *MI = SU->getInstr(); + + if (TII->isTransOnly(MI)) + return AluTrans; + + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + return AluPredX; + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return AluT_XYZW; + case AMDGPU::COPY: + if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; + } + default: + break; + } + + // Does the instruction take a whole IG ? 
+ // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + return AluT_XYZW; + } + + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } + + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + + // Is the result already member of a X/Y/Z/W class ? + unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + // LDS src registers cannot be used in the Trans slot. + if (TII->readsLDSSrcReg(MI)) + return AluT_XYZW; + + return AluAny; + +} + +int R600SchedStrategy::getInstKind(SUnit* SU) { + int Opcode = SU->getInstr()->getOpcode(); + + if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) + return IDFetch; + + if (TII->isALUInstr(Opcode)) { + return IDAlu; + } + + switch (Opcode) { + case AMDGPU::PRED_X: + case AMDGPU::COPY: + case AMDGPU::CONST_COPY: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return IDAlu; + default: + return IDOther; + } +} + +SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { + if (Q.empty()) + return nullptr; + for (std::vector::reverse_iterator It = Q.rbegin(), E = Q.rend(); + It != E; ++It) { + SUnit *SU = *It; + InstructionsGroupCandidate.push_back(SU->getInstr()); + if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) + && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) + ) { + InstructionsGroupCandidate.pop_back(); + Q.erase((It + 1).base()); + return SU; + } else { + InstructionsGroupCandidate.pop_back(); + } + } + return nullptr; +} + +void R600SchedStrategy::LoadAlu() { + std::vector &QSrc = Pending[IDAlu]; + for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { + AluKind AK = getAluKind(QSrc[i]); + AvailableAlus[AK].push_back(QSrc[i]); + } + QSrc.clear(); +} + +void R600SchedStrategy::PrepareNextSlot() { + DEBUG(dbgs() << "New Slot\n"); + assert (OccupedSlotsMask && "Slot wasn't filled"); + OccupedSlotsMask = 0; +// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) +// OccupedSlotsMask |= 16; + InstructionsGroupCandidate.clear(); + LoadAlu(); +} + +void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { + int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (DstIndex == -1) { + return; + } + unsigned DestReg = MI->getOperand(DstIndex).getReg(); + // PressureRegister crashes if an operand is def and used in the same inst + // and we try to constraint its regclass + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && !MO.isDef() && + MO.getReg() == DestReg) + return; + } + // 
Constrains the regclass of DestReg to assign it to Slot + switch (Slot) { + case 0: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + break; + case 1: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + break; + case 2: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + break; + case 3: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + break; + } +} + +SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { + static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; + SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu); + if (SlotedSU) + return SlotedSU; + SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu); + if (UnslotedSU) + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; +} + +unsigned R600SchedStrategy::AvailablesAluCount() const { + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + + AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + + AvailableAlus[AluPredX].size(); +} + +SUnit* R600SchedStrategy::pickAlu() { + while (AvailablesAluCount() || !Pending[IDAlu].empty()) { + if (!OccupedSlotsMask) { + // Bottom up scheduling : predX must comes first + if (!AvailableAlus[AluPredX].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluPredX], false); + } + // Flush physical reg copies (RA will discard them) + if (!AvailableAlus[AluDiscarded].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluDiscarded], false); + } + // If there is a T_XYZW alu available, use it + if (!AvailableAlus[AluT_XYZW].empty()) { + OccupedSlotsMask |= 15; + return PopInst(AvailableAlus[AluT_XYZW], false); + } + } + bool TransSlotOccuped = OccupedSlotsMask & 16; + if (!TransSlotOccuped && VLIW5) { + if (!AvailableAlus[AluTrans].empty()) { + OccupedSlotsMask |= 16; + return PopInst(AvailableAlus[AluTrans], false); + } + SUnit *SU = AttemptFillSlot(3, true); + if (SU) { + OccupedSlotsMask |= 16; + return SU; + } + } + for (int Chan = 3; Chan > -1; --Chan) { + bool isOccupied = OccupedSlotsMask & (1 << Chan); + if (!isOccupied) { + SUnit *SU = AttemptFillSlot(Chan, false); + if (SU) { + OccupedSlotsMask |= (1 << Chan); + InstructionsGroupCandidate.push_back(SU->getInstr()); + return SU; + } + } + } + PrepareNextSlot(); + } + return nullptr; +} + +SUnit* R600SchedStrategy::pickOther(int QID) { + SUnit *SU = nullptr; + std::vector &AQ = Available[QID]; + + if (AQ.empty()) { + MoveUnits(Pending[QID], AQ); + } + if (!AQ.empty()) { + SU = AQ.back(); + AQ.resize(AQ.size() - 1); + } + return SU; +} diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h new file mode 100644 index 00000000000..fc5b95c28e7 --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -0,0 +1,103 @@ +//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H + +#include "R600InstrInfo.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace llvm { + +class R600SchedStrategy : public MachineSchedStrategy { + + const ScheduleDAGMILive *DAG; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + MachineRegisterInfo *MRI; + + enum InstKind { + IDAlu, + IDFetch, + IDOther, + IDLast + }; + + enum AluKind { + AluAny, + AluT_X, + AluT_Y, + AluT_Z, + AluT_W, + AluT_XYZW, + AluPredX, + AluTrans, + AluDiscarded, // LLVM Instructions that are going to be eliminated + AluLast + }; + + std::vector Available[IDLast], Pending[IDLast]; + std::vector AvailableAlus[AluLast]; + std::vector PhysicalRegCopy; + + InstKind CurInstKind; + int CurEmitted; + InstKind NextInstKind; + + unsigned AluInstCount; + unsigned FetchInstCount; + + int InstKindLimit[IDLast]; + + int OccupedSlotsMask; + +public: + R600SchedStrategy() : + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { + } + + virtual ~R600SchedStrategy() {} + + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void releaseBottomNode(SUnit *SU) override; + +private: + std::vector InstructionsGroupCandidate; + bool VLIW5; + + int getInstKind(SUnit *SU); + bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; + AluKind getAluKind(SUnit *SU) const; + void LoadAlu(); + unsigned AvailablesAluCount() const; + SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); + void PrepareNextSlot(); + SUnit *PopInst(std::vector &Q, bool AnyALU); + + void AssignSlot(MachineInstr *MI, unsigned Slot); + SUnit* pickAlu(); + SUnit* pickOther(int QID); + void MoveUnits(std::vector &QSrc, std::vector &QDst); +}; + +} // namespace llvm + +#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp new file mode 100644 index 00000000000..0c06ccc736d --- /dev/null +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -0,0 +1,382 @@ +//===--------------------- R600MergeVectorRegisters.cpp -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass merges inputs of swizzeable instructions into vector sharing +/// common data and/or have enough undef subreg using swizzle abilities. +/// +/// For instance let's consider the following pseudo code : +/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... +/// vreg7 = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 +/// +/// is turned into : +/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... 
+/// vreg7 = INSERT_SUBREG vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 +/// +/// This allow regalloc to reduce register pressure for vector registers and +/// to reduce MOV count. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "vec-merger" + +namespace { + +static bool +isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { + for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), + E = MRI.def_instr_end(); It != E; ++It) { + return (*It).isImplicitDef(); + } + if (MRI.isReserved(Reg)) { + return false; + } + llvm_unreachable("Reg without a def"); + return false; +} + +class RegSeqInfo { +public: + MachineInstr *Instr; + DenseMap RegToChan; + std::vector UndefReg; + RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { + assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { + MachineOperand &MO = Instr->getOperand(i); + unsigned Chan = Instr->getOperand(i + 1).getImm(); + if (isImplicitlyDef(MRI, MO.getReg())) + UndefReg.push_back(Chan); + else + RegToChan[MO.getReg()] = Chan; + } + } + RegSeqInfo() {} + + bool operator==(const RegSeqInfo &RSI) const { + return RSI.Instr == Instr; + } +}; + +class R600VectorRegMerger : public MachineFunctionPass { +private: + MachineRegisterInfo *MRI; + const R600InstrInfo *TII; + bool canSwizzle(const MachineInstr &) const; + bool areAllUsesSwizzeable(unsigned Reg) const; + void SwizzleInput(MachineInstr &, + const std::vector > &) const; + bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, + std::vector > &Remap) const; + bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan); + bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan); + MachineInstr *RebuildVector(RegSeqInfo *MI, + const RegSeqInfo *BaseVec, + const std::vector > &RemapChan) const; + void RemoveMI(MachineInstr *); + void trackRSI(const RegSeqInfo &RSI); + + typedef DenseMap > InstructionSetMap; + DenseMap PreviousRegSeq; + InstructionSetMap PreviousRegSeqByReg; + InstructionSetMap PreviousRegSeqByUndefCount; +public: + static char ID; + R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Vector Registers Merge Pass"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; + +char R600VectorRegMerger::ID = 0; + +bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) + const { + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) + return true; + switch (MI.getOpcode()) { + case AMDGPU::R600_ExportSwz: + case AMDGPU::EG_ExportSwz: + return true; + default: + return false; + } +} + +bool 
R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, + RegSeqInfo *ToMerge, std::vector< std::pair > &Remap) + const { + unsigned CurrentUndexIdx = 0; + for (DenseMap::iterator It = ToMerge->RegToChan.begin(), + E = ToMerge->RegToChan.end(); It != E; ++It) { + DenseMap::const_iterator PosInUntouched = + Untouched->RegToChan.find((*It).first); + if (PosInUntouched != Untouched->RegToChan.end()) { + Remap.push_back(std::pair + ((*It).second, (*PosInUntouched).second)); + continue; + } + if (CurrentUndexIdx >= Untouched->UndefReg.size()) + return false; + Remap.push_back(std::pair + ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + } + + return true; +} + +static +unsigned getReassignedChan( + const std::vector > &RemapChan, + unsigned Chan) { + for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { + if (RemapChan[j].first == Chan) + return RemapChan[j].second; + } + llvm_unreachable("Chan wasn't reassigned"); +} + +MachineInstr *R600VectorRegMerger::RebuildVector( + RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, + const std::vector > &RemapChan) const { + unsigned Reg = RSI->Instr->getOperand(0).getReg(); + MachineBasicBlock::iterator Pos = RSI->Instr; + MachineBasicBlock &MBB = *Pos->getParent(); + DebugLoc DL = Pos->getDebugLoc(); + + unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + DenseMap UpdatedRegToChan = BaseRSI->RegToChan; + std::vector UpdatedUndef = BaseRSI->UndefReg; + for (DenseMap::iterator It = RSI->RegToChan.begin(), + E = RSI->RegToChan.end(); It != E; ++It) { + unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned SubReg = (*It).first; + unsigned Swizzle = (*It).second; + unsigned Chan = getReassignedChan(RemapChan, Swizzle); + + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + DstReg) + .addReg(SrcVec) + .addReg(SubReg) + .addImm(Chan); + UpdatedRegToChan[SubReg] = Chan; + std::vector::iterator ChanPos = + std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); + if (ChanPos != UpdatedUndef.end()) + UpdatedUndef.erase(ChanPos); + assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == + UpdatedUndef.end() && + "UpdatedUndef shouldn't contain Chan more than once!"); + DEBUG(dbgs() << " ->"; Tmp->dump();); + (void)Tmp; + SrcVec = DstReg; + } + Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) + .addReg(SrcVec); + DEBUG(dbgs() << " ->"; Pos->dump();); + + DEBUG(dbgs() << " Updating Swizzle:\n"); + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); + SwizzleInput(*It, RemapChan); + DEBUG((*It).dump()); + } + RSI->Instr->eraseFromParent(); + + // Update RSI + RSI->Instr = Pos; + RSI->RegToChan = UpdatedRegToChan; + RSI->UndefReg = UpdatedUndef; + + return Pos; +} + +void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { + for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), + E = PreviousRegSeqByReg.end(); It != E; ++It) { + std::vector &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } + for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), + E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { + std::vector &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } +} + +void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, + const std::vector > &RemapChan) const { + unsigned Offset; + if (TII->get(MI.getOpcode()).TSFlags & 
R600_InstFlag::TEX_INST) + Offset = 2; + else + Offset = 3; + for (unsigned i = 0; i < 4; i++) { + unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; + for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { + if (RemapChan[j].first == Swizzle) { + MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + break; + } + } + } +} + +bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + if (!canSwizzle(*It)) + return false; + } + return true; +} + +bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan) { + for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), + MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { + if (!MOp->isReg()) + continue; + if (PreviousRegSeqByReg[MOp->getReg()].empty()) + continue; + for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { + CompatibleRSI = PreviousRegSeq[MI]; + if (RSI == CompatibleRSI) + continue; + if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) + return true; + } + } + return false; +} + +bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan) { + unsigned NeededUndefs = 4 - RSI.UndefReg.size(); + if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) + return false; + std::vector &MIs = + PreviousRegSeqByUndefCount[NeededUndefs]; + CompatibleRSI = PreviousRegSeq[MIs.back()]; + tryMergeVector(&CompatibleRSI, &RSI, RemapChan); + return true; +} + +void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { + for (DenseMap::const_iterator + It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { + PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); + } + PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); + PreviousRegSeq[RSI.Instr] = RSI; +} + +bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast(Fn.getSubtarget().getInstrInfo()); + MRI = &(Fn.getRegInfo()); + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock *MB = MBB; + PreviousRegSeq.clear(); + PreviousRegSeqByReg.clear(); + PreviousRegSeqByUndefCount.clear(); + + for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); + MII != MIIE; ++MII) { + MachineInstr *MI = MII; + if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { + if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { + unsigned Reg = MI->getOperand(1).getReg(); + for (MachineRegisterInfo::def_instr_iterator + It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); + It != E; ++It) { + RemoveMI(&(*It)); + } + } + continue; + } + + + RegSeqInfo RSI(*MRI, MI); + + // All uses of MI are swizzeable ? 
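+ // Merging rewrites the swizzle operands of every user of this REG_SEQUENCE
+ // (see SwizzleInput), so it is only safe when each user is a TEX or
+ // export-swizzle instruction accepted by canSwizzle().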
+ unsigned Reg = MI->getOperand(0).getReg(); + if (!areAllUsesSwizzeable(Reg)) + continue; + + DEBUG (dbgs() << "Trying to optimize "; + MI->dump(); + ); + + RegSeqInfo CandidateRSI; + std::vector > RemapChan; + DEBUG(dbgs() << "Using common slots...\n";); + if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { + // Remove CandidateRSI mapping + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + DEBUG(dbgs() << "Using free slots...\n";); + RemapChan.clear(); + if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + //Failed to merge + trackRSI(RSI); + } + } + return false; +} + +} + +llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { + return new R600VectorRegMerger(tm); +} diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp new file mode 100644 index 00000000000..deee5bc3997 --- /dev/null +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -0,0 +1,408 @@ +//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass implements instructions packetization for R600. It unsets isLast +/// bit of instructions inside a bundle and substitutes src register with +/// PreviousVector when applicable. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "packets" + +namespace { + +class R600Packetizer : public MachineFunctionPass { + +public: + static char ID; + R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Packetizer"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; +char R600Packetizer::ID = 0; + +class R600PacketizerList : public VLIWPacketizerList { + +private: + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + bool VLIW5; + bool ConsideredInstUsesAlreadyWrittenVectorElement; + + unsigned getSlot(const MachineInstr *MI) const { + return TRI.getHWRegChan(MI->getOperand(0).getReg()); + } + + /// \returns register to PV chan mapping for bundle/single instructions that + /// immediately precedes I. 
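+ /// For example (register numbers are illustrative): if the preceding group
+ /// wrote T0.X and T1.Y, reads of those registers in the current group can be
+ /// replaced by PV.X and PV.Y, while a value produced in the trans slot is
+ /// read back through PS instead.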
+ DenseMap getPreviousVector(MachineBasicBlock::iterator I) + const { + DenseMap Result; + I--; + if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) + return Result; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + if (I->isBundle()) + BI++; + int LastDstChan = -1; + do { + bool isTrans = false; + int BISlot = getSlot(BI); + if (LastDstChan >= BISlot) + isTrans = true; + LastDstChan = BISlot; + if (TII->isPredicated(BI)) + continue; + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) + continue; + int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + if (DstIdx == -1) { + continue; + } + unsigned Dst = BI->getOperand(DstIdx).getReg(); + if (isTrans || TII->isTransOnly(BI)) { + Result[Dst] = AMDGPU::PS; + continue; + } + if (BI->getOpcode() == AMDGPU::DOT4_r600 || + BI->getOpcode() == AMDGPU::DOT4_eg) { + Result[Dst] = AMDGPU::PV_X; + continue; + } + if (Dst == AMDGPU::OQAP) { + continue; + } + unsigned PVReg = 0; + switch (TRI.getHWRegChan(Dst)) { + case 0: + PVReg = AMDGPU::PV_X; + break; + case 1: + PVReg = AMDGPU::PV_Y; + break; + case 2: + PVReg = AMDGPU::PV_Z; + break; + case 3: + PVReg = AMDGPU::PV_W; + break; + default: + llvm_unreachable("Invalid Chan"); + } + Result[Dst] = PVReg; + } while ((++BI)->isBundledWithPred()); + return Result; + } + + void substitutePV(MachineInstr *MI, const DenseMap &PVs) + const { + unsigned Ops[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0) + continue; + unsigned Src = MI->getOperand(OperandIdx).getReg(); + const DenseMap::const_iterator It = PVs.find(Src); + if (It != PVs.end()) + MI->getOperand(OperandIdx).setReg(It->second); + } + } +public: + // Ctor. + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + : VLIWPacketizerList(MF, MLI, true), + TII(static_cast( + MF.getSubtarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { + VLIW5 = !MF.getSubtarget().hasCaymanISA(); + } + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() override { + ConsideredInstUsesAlreadyWrittenVectorElement = false; + } + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(MachineInstr *MI, + MachineBasicBlock *MBB) override { + return false; + } + + // isSoloInstruction - return true if instruction MI can not be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(MachineInstr *MI) override { + if (TII->isVector(*MI)) + return true; + if (!TII->isALUInstr(MI->getOpcode())) + return true; + if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + return true; + // XXX: This can be removed once the packetizer properly handles all the + // LDS instruction group restrictions. + if (TII->isLDSInstr(MI->getOpcode())) + return true; + return false; + } + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { + MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); + if (getSlot(MII) == getSlot(MIJ)) + ConsideredInstUsesAlreadyWrittenVectorElement = true; + // Does MII and MIJ share the same pred_sel ? 
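+ // Instructions guarded by different predicate selects cannot issue in the
+ // same bundle, so reject the pair below when the pred_sel operands differ.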
+ int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + if (PredI != PredJ) + return false; + if (SUJ->isSucc(SUI)) { + for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { + const SDep &Dep = SUJ->Succs[i]; + if (Dep.getSUnit() != SUI) + continue; + if (Dep.getKind() == SDep::Anti) + continue; + if (Dep.getKind() == SDep::Output) + if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) + continue; + return false; + } + } + + bool ARDef = TII->definesAddressRegister(MII) || + TII->definesAddressRegister(MIJ); + bool ARUse = TII->usesAddressRegister(MII) || + TII->usesAddressRegister(MIJ); + if (ARDef && ARUse) + return false; + + return true; + } + + // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // and SUJ. + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } + + void setIsLastBit(MachineInstr *MI, unsigned Bit) const { + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + MI->getOperand(LastOp).setImm(Bit); + } + + bool isBundlableWithCurrentPMI(MachineInstr *MI, + const DenseMap &PV, + std::vector &BS, + bool &isTransSlot) { + isTransSlot = TII->isTransOnly(MI); + assert (!isTransSlot || VLIW5); + + // Is the dst reg sequence legal ? + if (!isTransSlot && !CurrentPacketMIs.empty()) { + if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { + if (ConsideredInstUsesAlreadyWrittenVectorElement && + !TII->isVectorOnly(MI) && VLIW5) { + isTransSlot = true; + DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); + } + else + return false; + } + } + + // Are the Constants limitations met ? + CurrentPacketMIs.push_back(MI); + if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Consts read limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // Is there a BankSwizzle set that meet Read Port limitations ? + if (!TII->fitsReadPortLimitations(CurrentPacketMIs, + PV, BS, isTransSlot)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Read port limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // We cannot read LDS source registrs from the Trans slot. + if (isTransSlot && TII->readsLDSSrcReg(MI)) + return false; + + CurrentPacketMIs.pop_back(); + return true; + } + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { + MachineBasicBlock::iterator FirstInBundle = + CurrentPacketMIs.empty() ? 
MI : CurrentPacketMIs.front(); + const DenseMap &PV = + getPreviousVector(FirstInBundle); + std::vector BS; + bool isTransSlot; + + if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { + for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { + MachineInstr *MI = CurrentPacketMIs[i]; + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS[i]); + } + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS.back()); + if (!CurrentPacketMIs.empty()) + setIsLastBit(CurrentPacketMIs.back(), 0); + substitutePV(MI, PV); + MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); + if (isTransSlot) { + endPacket(std::next(It)->getParent(), std::next(It)); + } + return It; + } + endPacket(MI->getParent(), MI); + if (TII->isTransOnly(MI)) + return MI; + return VLIWPacketizerList::addToPacket(MI); + } +}; + +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis(); + + // Instantiate the packetizer. + R600PacketizerList Packetizer(Fn, MLI); + + // DFA state table should not be empty. + assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + + // + // Loop over all basic blocks and remove KILL pseudo-instructions + // These instructions confuse the dependence analysis. Consider: + // D0 = ... (Insn 0) + // R0 = KILL R0, D0 (Insn 1) + // R0 = ... (Insn 2) + // Here, Insn 1 will result in the dependence graph not emitting an output + // dependence between Insn 0 and Insn 2. This can lead to incorrect + // packetization + // + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock::iterator End = MBB->end(); + MachineBasicBlock::iterator MI = MBB->begin(); + while (MI != End) { + if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || + (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + MachineBasicBlock::iterator DeleteMI = MI; + ++MI; + MBB->erase(DeleteMI); + End = MBB->end(); + continue; + } + ++MI; + } + } + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + // Find scheduling regions and schedule / packetize each region. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin();) { + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + break; + } + I = MBB->begin(); + + // Skip empty scheduling regions. + if (I == RegionEnd) { + RegionEnd = std::prev(RegionEnd); + --RemainingCount; + continue; + } + // Skip regions with one instruction. 
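+ // (A lone instruction already forms its own packet, so there is nothing
+ // for the packetizer to do in this region.)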
+ if (I == std::prev(RegionEnd)) { + RegionEnd = std::prev(RegionEnd); + continue; + } + + Packetizer.PacketizeMIs(MBB, I, RegionEnd); + RegionEnd = I; + } + } + + return true; + +} + +} // end anonymous namespace + +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { + return new R600Packetizer(tm); +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp new file mode 100644 index 00000000000..fb0359cfc65 --- /dev/null +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -0,0 +1,91 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "R600RegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { + RCW.RegWeight = 0; + RCW.WeightLimit = 0; +} + +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + + const R600InstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + + Reserved.set(AMDGPU::ZERO); + Reserved.set(AMDGPU::HALF); + Reserved.set(AMDGPU::ONE); + Reserved.set(AMDGPU::ONE_INT); + Reserved.set(AMDGPU::NEG_HALF); + Reserved.set(AMDGPU::NEG_ONE); + Reserved.set(AMDGPU::PV_X); + Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); + Reserved.set(AMDGPU::PREDICATE_BIT); + Reserved.set(AMDGPU::PRED_SEL_OFF); + Reserved.set(AMDGPU::PRED_SEL_ZERO); + Reserved.set(AMDGPU::PRED_SEL_ONE); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + TII->reserveIndirectRegisters(Reserved, MF); + + return Reserved; +} + +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { + return GET_REG_INDEX(getEncodingValue(Reg)); +} + +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + } +} + +const RegClassWeight &R600RegisterInfo::getRegClassWeight( + const TargetRegisterClass *RC) const { + return RCW; +} + +bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + switch (Reg) { + case AMDGPU::OQAP: + case AMDGPU::OQBP: + case AMDGPU::AR_X: + return false; + default: + return true; + } +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h new file mode 100644 index 00000000000..9713e600a72 --- /dev/null +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -0,0 +1,49 @@ +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600RegisterInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H + +#include "AMDGPURegisterInfo.h" + +namespace llvm { + +class AMDGPUSubtarget; + +struct R600RegisterInfo : public AMDGPURegisterInfo { + RegClassWeight RCW; + + R600RegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + /// \brief get the HW encoding for a register's channel. + unsigned getHWRegChan(unsigned reg) const; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; + + // \returns true if \p Reg can be defined in one ALU caluse and used in another. + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td new file mode 100644 index 00000000000..cc667d985a8 --- /dev/null +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -0,0 +1,252 @@ + +class R600Reg encoding> : Register { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +class R600RegWithChan sel, string chan> : + Register { + + field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, + !if(!eq(chan, "Y"), 1, + !if(!eq(chan, "Z"), 2, + !if(!eq(chan, "W"), 3, 0)))); + let HWEncoding{8-0} = sel; + let HWEncoding{10-9} = chan_encoding; + let Namespace = "AMDGPU"; +} + +class R600Reg_128 subregs, bits<16> encoding> : + RegisterWithSubRegs { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1, sub2, sub3]; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64 subregs, bits<16> encoding> : + RegisterWithSubRegs { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64Vertical : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast("T"#lo#"_"#chan), !cast("T"#hi#"_"#chan)], + lo +>; + +foreach Index = 0-127 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + } + // 128-bit Temporary Registers + def T#Index#_XYZW : R600Reg_128 <"T"#Index#"", + [!cast("T"#Index#"_X"), + !cast("T"#Index#"_Y"), + !cast("T"#Index#"_Z"), + !cast("T"#Index#"_W")], + Index>; + + def T#Index#_XY : R600Reg_64 <"T"#Index#"", + [!cast("T"#Index#"_X"), + !cast("T"#Index#"_Y")], + Index>; +} + +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + !if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast("T0_"#Chan), + !cast("T1_"#Chan), + !cast("T2_"#Chan), + !cast("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } 
+} + + +// KCACHE_BANK0 +foreach Index = 159-128 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW", + [!cast("KC0_"#Index#"_X"), + !cast("KC0_"#Index#"_Y"), + !cast("KC0_"#Index#"_Z"), + !cast("KC0_"#Index#"_W")], + Index>; +} + +// KCACHE_BANK1 +foreach Index = 191-160 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW", + [!cast("KC1_"#Index#"_X"), + !cast("KC1_"#Index#"_Y"), + !cast("KC1_"#Index#"_Z"), + !cast("KC1_"#Index#"_W")], + Index>; +} + + +// Array Base Register holding input in FS +foreach Index = 448-480 in { + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; +} + + +// Special Registers + +def OQA : R600Reg<"OQA", 219>; +def OQB : R600Reg<"OQB", 220>; +def OQAP : R600Reg<"OQAP", 221>; +def OQBP : R600Reg<"OQAP", 222>; +def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>; +def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>; +def ZERO : R600Reg<"0.0", 248>; +def ONE : R600Reg<"1.0", 249>; +def NEG_ONE : R600Reg<"-1.0", 249>; +def ONE_INT : R600Reg<"1", 250>; +def HALF : R600Reg<"0.5", 252>; +def NEG_HALF : R600Reg<"-0.5", 252>; +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; +def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; +def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; +def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; +def PV_X : R600RegWithChan<"PV.X", 254, "X">; +def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">; +def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">; +def PV_W : R600RegWithChan<"PV.W", 254, "W">; +def PS: R600Reg<"PS", 255>; +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; + +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "ArrayBase%u", 448, 480))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; + +let isAllocatable = 0 in { + +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME if we add more vertical vector registers we will need to ad more +// registers to these classes. 
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; + +def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, + (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; + +def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_X", 128, 159))>; + +def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Y", 128, 159))>; + +def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Z", 128, 159))>; + +def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_W", 128, 159))>; + +def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC0_X, R600_KC0_Y, + R600_KC0_Z, R600_KC0_W)>; + +def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_X", 160, 191))>; + +def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Y", 160, 191))>; + +def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Z", 160, 191))>; + +def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_W", 160, 191))>; + +def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC1_X, R600_KC1_Y, + R600_KC1_Z, R600_KC1_W)>; + +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Y", 0, 127))>; + +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Z", 0, 127))>; + +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_W", 0, 127))>; + +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; + +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add + R600_TReg32, + R600_ArrayBase, + R600_Addr, + R600_KC0, R600_KC1, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM, OQAP + )>; + +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; + +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add + PREDICATE_BIT)>; + +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add (sequence "T%u_XYZW", 0, 127))> { + let CopyCost = -1; +} + +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td new file mode 100644 index 00000000000..df62bf85c0a --- /dev/null +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -0,0 +1,49 @@ +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS +// slot has been removed. +// +//===----------------------------------------------------------------------===// + + +def ALU_X : FuncUnit; +def ALU_Y : FuncUnit; +def ALU_Z : FuncUnit; +def ALU_W : FuncUnit; +def TRANS : FuncUnit; + +def AnyALU : InstrItinClass; +def VecALU : InstrItinClass; +def TransALU : InstrItinClass; +def XALU : InstrItinClass; + +def R600_VLIW5_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], + [], + [ + InstrItinData]>, + InstrItinData]>, + InstrItinData]>, + InstrItinData]>, + InstrItinData]> + ] +>; + +def R600_VLIW4_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], + [], + [ + InstrItinData]>, + InstrItinData]>, + InstrItinData]>, + InstrItinData]> + ] +>; diff --git a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp new file mode 100644 index 00000000000..2fc7b02f673 --- /dev/null +++ b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp @@ -0,0 +1,303 @@ +//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass translates tgsi-like texture intrinsics into R600 texture +/// closer to hardware intrinsics. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { +class R600TextureIntrinsicsReplacer : + public FunctionPass, public InstVisitor { + static char ID; + + Module *Mod; + Type *FloatType; + Type *Int32Type; + Type *V4f32Type; + Type *V4i32Type; + FunctionType *TexSign; + FunctionType *TexQSign; + + void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, + unsigned SrcSelect[4], unsigned CT[4], + bool &useShadowVariant) { + enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY, + TEXTURE_SHADOWCUBE, + TEXTURE_2D_MSAA, + TEXTURE_2D_ARRAY_MSAA, + TEXTURE_CUBE_ARRAY, + TEXTURE_SHADOWCUBE_ARRAY + }; + + switch (TextureType) { + case 0: + useShadowVariant = false; + return; + case TEXTURE_RECT: + case TEXTURE_1D: + case TEXTURE_2D: + case TEXTURE_3D: + case TEXTURE_CUBE: + case TEXTURE_1D_ARRAY: + case TEXTURE_2D_ARRAY: + case TEXTURE_CUBE_ARRAY: + case TEXTURE_2D_MSAA: + case TEXTURE_2D_ARRAY_MSAA: + useShadowVariant = false; + break; + case TEXTURE_SHADOW1D: + case TEXTURE_SHADOW2D: + case TEXTURE_SHADOWRECT: + case TEXTURE_SHADOW1D_ARRAY: + case TEXTURE_SHADOW2D_ARRAY: + case TEXTURE_SHADOWCUBE: + case TEXTURE_SHADOWCUBE_ARRAY: + useShadowVariant = true; + break; + default: + llvm_unreachable("Unknow Texture Type"); + } + + if (TextureType == TEXTURE_RECT || + TextureType == TEXTURE_SHADOWRECT) { + CT[0] = 0; + CT[1] = 0; + } + + if (TextureType == TEXTURE_CUBE_ARRAY || + TextureType == TEXTURE_SHADOWCUBE_ARRAY) + CT[2] = 0; + + if (TextureType == 
TEXTURE_1D_ARRAY || + TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (hasLOD && useShadowVariant) { + CT[1] = 0; + } else { + CT[2] = 0; + SrcSelect[2] = 1; + } + } else if (TextureType == TEXTURE_2D_ARRAY || + TextureType == TEXTURE_SHADOW2D_ARRAY) { + CT[2] = 0; + } + + if ((TextureType == TEXTURE_SHADOW1D || + TextureType == TEXTURE_SHADOW2D || + TextureType == TEXTURE_SHADOWRECT || + TextureType == TEXTURE_SHADOW1D_ARRAY) && + !(hasLOD && useShadowVariant)) + SrcSelect[3] = 2; + } + + void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, + unsigned SrcSelect[4], Value *Offset[3], Value *Resource, + Value *Sampler, unsigned CT[4], Value *Coord) { + IRBuilder<> Builder(&I); + Constant *Mask[] = { + ConstantInt::get(Int32Type, SrcSelect[0]), + ConstantInt::get(Int32Type, SrcSelect[1]), + ConstantInt::get(Int32Type, SrcSelect[2]), + ConstantInt::get(Int32Type, SrcSelect[3]) + }; + Value *SwizzleMask = ConstantVector::get(Mask); + Value *SwizzledCoord = + Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); + + Value *Args[] = { + SwizzledCoord, + Offset[0], + Offset[1], + Offset[2], + Resource, + Sampler, + ConstantInt::get(Int32Type, CT[0]), + ConstantInt::get(Int32Type, CT[1]), + ConstantInt::get(Int32Type, CT[2]), + ConstantInt::get(Int32Type, CT[3]) + }; + + Function *F = Mod->getFunction(Name); + if (!F) { + F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); + F->addFnAttr(Attribute::ReadNone); + } + I.replaceAllUsesWith(Builder.CreateCall(F, Args)); + I.eraseFromParent(); + } + + void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, + const char *VanillaInt, + const char *ShadowInt) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(1); + Value *SamplerId = I.getArgOperand(2); + + unsigned TextureType = + cast(I.getArgOperand(3))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0) + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + + void ReplaceTXF(CallInst &I) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(4); + Value *SamplerId = I.getArgOperand(5); + + unsigned TextureType = + cast(I.getArgOperand(6))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + I.getArgOperand(1), + I.getArgOperand(2), + I.getArgOperand(3), + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + +public: + R600TextureIntrinsicsReplacer(): + FunctionPass(ID) { + } + + bool doInitialization(Module &M) override { + LLVMContext &Ctx = M.getContext(); + Mod = &M; + FloatType = Type::getFloatTy(Ctx); + Int32Type = Type::getInt32Ty(Ctx); + V4f32Type = VectorType::get(FloatType, 4); + V4i32Type = VectorType::get(Int32Type, 4); + Type *ArgsType[] = { + V4f32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); + Type *ArgsQType[] = { + V4i32Type, + Int32Type, + Int32Type, 
+ Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); + return false; + } + + bool runOnFunction(Function &F) override { + visit(F); + return false; + } + + const char *getPassName() const override { + return "R600 Texture Intrinsics Replacer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + } + + void visitCallInst(CallInst &I) { + if (!I.getCalledFunction()) + return; + + StringRef Name = I.getCalledFunction()->getName(); + if (Name == "llvm.AMDGPU.tex") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); + return; + } + if (Name == "llvm.AMDGPU.txl") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); + return; + } + if (Name == "llvm.AMDGPU.txb") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); + return; + } + if (Name == "llvm.AMDGPU.txf") { + ReplaceTXF(I); + return; + } + if (Name == "llvm.AMDGPU.txq") { + ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); + return; + } + if (Name == "llvm.AMDGPU.ddx") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); + return; + } + if (Name == "llvm.AMDGPU.ddy") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); + return; + } + } + +}; + +char R600TextureIntrinsicsReplacer::ID = 0; + +} + +FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { + return new R600TextureIntrinsicsReplacer(); +} diff --git a/lib/Target/AMDGPU/R700Instructions.td b/lib/Target/AMDGPU/R700Instructions.td new file mode 100644 index 00000000000..613a0d729bb --- /dev/null +++ b/lib/Target/AMDGPU/R700Instructions.td @@ -0,0 +1,21 @@ +//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to R700 and newer VLIW4/VLIW5 GPUs +// - Available only on R700 family GPUs. +// +//===----------------------------------------------------------------------===// + +def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; + +let Predicates = [isR700] in { + def SIN_r700 : SIN_Common<0x6E>; + def COS_r700 : COS_Common<0x6F>; +} diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp new file mode 100644 index 00000000000..ccfbf1bf19e --- /dev/null +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -0,0 +1,365 @@ +//===-- SIAnnotateControlFlow.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Annotates the control flow with hardware specific intrinsics. 
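+///
+/// Conditional branches and loop back edges are rewritten in terms of the
+/// llvm.SI.if, llvm.SI.else, llvm.SI.break, llvm.SI.if.break,
+/// llvm.SI.else.break, llvm.SI.loop and llvm.SI.end.cf intrinsics declared
+/// below, which the SI backend later lowers to its hardware control-flow
+/// (exec mask) operations.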
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-annotate-control-flow" + +namespace { + +// Complex types used in this pass +typedef std::pair StackEntry; +typedef SmallVector StackVector; + +// Intrinsic names the control flow is annotated with +static const char *const IfIntrinsic = "llvm.SI.if"; +static const char *const ElseIntrinsic = "llvm.SI.else"; +static const char *const BreakIntrinsic = "llvm.SI.break"; +static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; +static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; +static const char *const LoopIntrinsic = "llvm.SI.loop"; +static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; + +class SIAnnotateControlFlow : public FunctionPass { + + static char ID; + + Type *Boolean; + Type *Void; + Type *Int64; + Type *ReturnStruct; + + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + Constant *Int64Zero; + + Constant *If; + Constant *Else; + Constant *Break; + Constant *IfBreak; + Constant *ElseBreak; + Constant *Loop; + Constant *EndCf; + + DominatorTree *DT; + StackVector Stack; + + LoopInfo *LI; + + bool isTopOfStack(BasicBlock *BB); + + Value *popSaved(); + + void push(BasicBlock *BB, Value *Saved); + + bool isElse(PHINode *Phi); + + void eraseIfUnused(PHINode *Phi); + + void openIf(BranchInst *Term); + + void insertElse(BranchInst *Term); + + Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); + + void handleLoop(BranchInst *Term); + + void closeControlFlow(BasicBlock *BB); + +public: + SIAnnotateControlFlow(): + FunctionPass(ID) { } + + bool doInitialization(Module &M) override; + + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "SI annotate control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char SIAnnotateControlFlow::ID = 0; + +/// \brief Initialize all the types and constants used in the pass +bool SIAnnotateControlFlow::doInitialization(Module &M) { + LLVMContext &Context = M.getContext(); + + Void = Type::getVoidTy(Context); + Boolean = Type::getInt1Ty(Context); + Int64 = Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + Int64Zero = ConstantInt::get(Int64, 0); + + If = M.getOrInsertFunction( + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); + + Else = M.getOrInsertFunction( + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); + + Break = M.getOrInsertFunction( + BreakIntrinsic, Int64, Int64, (Type *)nullptr); + + IfBreak = M.getOrInsertFunction( + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); + + ElseBreak = M.getOrInsertFunction( + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); + + Loop = M.getOrInsertFunction( + LoopIntrinsic, Boolean, Int64, (Type *)nullptr); + + EndCf = M.getOrInsertFunction( 
+ EndCfIntrinsic, Void, Int64, (Type *)nullptr); + + return false; +} + +/// \brief Is BB the last block saved on the stack ? +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { + return !Stack.empty() && Stack.back().first == BB; +} + +/// \brief Pop the last saved value from the control flow stack +Value *SIAnnotateControlFlow::popSaved() { + return Stack.pop_back_val().second; +} + +/// \brief Push a BB and saved value to the control flow stack +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { + Stack.push_back(std::make_pair(BB, Saved)); +} + +/// \brief Can the condition represented by this PHI node treated like +/// an "Else" block? +bool SIAnnotateControlFlow::isElse(PHINode *Phi) { + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + if (Phi->getIncomingBlock(i) == IDom) { + + if (Phi->getIncomingValue(i) != BoolTrue) + return false; + + } else { + if (Phi->getIncomingValue(i) != BoolFalse) + return false; + + } + } + return true; +} + +// \brief Erase "Phi" if it is not used any more +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + if (!Phi->hasNUsesOrMore(1)) + Phi->eraseFromParent(); +} + +/// \brief Open a new "If" block +void SIAnnotateControlFlow::openIf(BranchInst *Term) { + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Close the last "If" block and open a new "Else" block +void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + Value *Ret = CallInst::Create(Else, popSaved(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Recursively handle the condition leading to a loop +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, + llvm::Loop *L) { + + // Only search through PHI nodes which are inside the loop. If we try this + // with PHI nodes that are outside of the loop, we end up inserting new PHI + // nodes outside of the loop which depend on values defined inside the loop. + // This will break the module with + // 'Instruction does not dominate all users!' errors. 
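+ // Conditions that are not such PHIs fall through to the else branch below:
+ // an llvm.SI.if.break call is emitted at the end of the defining block (or
+ // at the top of the loop header when the value is defined outside the loop)
+ // and its result is returned for the caller to wire into the break phi.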
+ PHINode *Phi = nullptr; + if ((Phi = dyn_cast(Cond)) && L->contains(Phi)) { + + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; + + // Handle all non-constant incoming values first + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa(Incoming)) { + NewPhi->addIncoming(Broken, From); + continue; + } + + Phi->setIncomingValue(i, BoolFalse); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L); + NewPhi->addIncoming(PhiArg, From); + } + + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); + + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + + Value *Incoming = Phi->getIncomingValue(i); + if (Incoming != BoolTrue) + continue; + + BasicBlock *From = Phi->getIncomingBlock(i); + if (From == IDom) { + CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); + if (OldEnd && OldEnd->getCalledFunction() == EndCf) { + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); + continue; + } + } + TerminatorInst *Insert = From->getTerminator(); + Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); + NewPhi->setIncomingValue(i, PhiArg); + } + eraseIfUnused(Phi); + return Ret; + + } else if (Instruction *Inst = dyn_cast(Cond)) { + BasicBlock *Parent = Inst->getParent(); + Instruction *Insert; + if (L->contains(Inst)) { + Insert = Parent->getTerminator(); + } else { + Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); + } + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); + + } else { + llvm_unreachable("Unhandled loop condition!"); + } + return 0; +} + +/// \brief Handle a back edge (loop) +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + BasicBlock *BB = Term->getParent(); + llvm::Loop *L = LI->getLoopFor(BB); + BasicBlock *Target = Term->getSuccessor(1); + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + + Value *Cond = Term->getCondition(); + Term->setCondition(BoolTrue); + Value *Arg = handleLoopCondition(Cond, Broken, L); + + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); + PI != PE; ++PI) { + + Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); + } + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + push(Term->getSuccessor(0), Arg); +}/// \brief Close the last opened control flow +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + llvm::Loop *L = LI->getLoopFor(BB); + + if (L && L->getHeader() == BB) { + // We can't insert an EndCF call into a loop header, because it will + // get executed on every iteration of the loop, when it should be + // executed only once before the loop. + SmallVector Latches; + L->getLoopLatches(Latches); + + std::vector Preds; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) + Preds.push_back(*PI); + } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, + LI, false); + } + + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); +} + +/// \brief Annotate the control flow with intrinsics so the backend can +/// recognize if/then/else and loops. 
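One way to read the i64 values threaded through the break/loop intrinsics above is as a per-lane "has left the loop" mask for a 64-wide wave: each break ORs the exiting lanes in, and the wave only falls out of the loop once every active lane has broken. A standalone C++ model of that accumulation follows; the 8-lane wave and the per-lane trip counts are invented for the illustration.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t Exec = 0xFFu;   // 8 active lanes, for readability
      uint64_t Broken = 0;           // lanes that have left the loop so far
      int TripCount[8] = {1, 2, 2, 3, 1, 4, 2, 3};  // per-lane iteration counts

      for (int Iter = 1; (Broken & Exec) != Exec; ++Iter) {
        // Lanes whose exit condition becomes true this iteration set their
        // bit, mirroring how the break intrinsics OR new lanes into the mask.
        for (int Lane = 0; Lane < 8; ++Lane)
          if (Iter >= TripCount[Lane])
            Broken |= uint64_t(1) << Lane;
        std::printf("iter %d: broken mask = 0x%02llx\n", Iter,
                    (unsigned long long)Broken);
      }
      // The wave leaves the loop only once every active lane has broken out.
      return 0;
    }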
+bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis().getDomTree(); + LI = &getAnalysis().getLoopInfo(); + + for (df_iterator I = df_begin(&F.getEntryBlock()), + E = df_end(&F.getEntryBlock()); I != E; ++I) { + + BranchInst *Term = dyn_cast((*I)->getTerminator()); + + if (!Term || Term->isUnconditional()) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + continue; + } + + if (I.nodeVisited(Term->getSuccessor(1))) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + handleLoop(Term); + continue; + } + + if (isTopOfStack(*I)) { + PHINode *Phi = dyn_cast(Term->getCondition()); + if (Phi && Phi->getParent() == *I && isElse(Phi)) { + insertElse(Term); + eraseIfUnused(Phi); + continue; + } + closeControlFlow(*I); + } + openIf(Term); + } + + assert(Stack.empty()); + return true; +} + +/// \brief Create the annotation pass +FunctionPass *llvm::createSIAnnotateControlFlowPass() { + return new SIAnnotateControlFlow(); +} diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h new file mode 100644 index 00000000000..4727d971ab7 --- /dev/null +++ b/lib/Target/AMDGPU/SIDefines.h @@ -0,0 +1,172 @@ +//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCInstrDesc.h" + +#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H +#define LLVM_LIB_TARGET_R600_SIDEFINES_H + +namespace SIInstrFlags { +// This needs to be kept in sync with the field bits in InstSI. +enum { + SALU = 1 << 3, + VALU = 1 << 4, + + SOP1 = 1 << 5, + SOP2 = 1 << 6, + SOPC = 1 << 7, + SOPK = 1 << 8, + SOPP = 1 << 9, + + VOP1 = 1 << 10, + VOP2 = 1 << 11, + VOP3 = 1 << 12, + VOPC = 1 << 13, + + MUBUF = 1 << 14, + MTBUF = 1 << 15, + SMRD = 1 << 16, + DS = 1 << 17, + MIMG = 1 << 18, + FLAT = 1 << 19, + WQM = 1 << 20, + VGPRSpill = 1 << 21 +}; +} + +namespace llvm { +namespace AMDGPU { + enum OperandType { + /// Operand with register or 32-bit immediate + OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + /// Operand with register or inline constant + OPERAND_REG_INLINE_C + }; +} +} + +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 + }; + + // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. + // The result is true if any of these tests are true. 
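The format bits in SIInstrFlags above are intended to be ORed into an instruction's flag word (in LLVM, the TableGen-generated TSFlags) and queried with simple masks. A minimal standalone illustration, with the flag word treated as a plain integer and the helper names invented for the example:

    #include <cassert>
    #include <cstdint>

    namespace SIInstrFlags {
    enum : uint64_t {
      SALU = 1 << 3,  VALU = 1 << 4,
      VOP1 = 1 << 10, VOP2 = 1 << 11, VOP3 = 1 << 12, VOPC = 1 << 13,
      SMRD = 1 << 16, DS = 1 << 17
    };
    }

    // Hypothetical queries over an instruction's flag word.
    static bool isVALU(uint64_t TSFlags) {
      return (TSFlags & SIInstrFlags::VALU) != 0;
    }
    static bool isVOP3(uint64_t TSFlags) {
      return (TSFlags & SIInstrFlags::VOP3) != 0;
    }

    int main() {
      uint64_t TSFlags = SIInstrFlags::VALU | SIInstrFlags::VOP3;
      assert(isVALU(TSFlags) && isVOP3(TSFlags));
      assert((TSFlags & SIInstrFlags::SMRD) == 0);  // not a scalar memory read
      return 0;
    }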
+ enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; +} + +namespace SISrcMods { + enum { + NEG = 1 << 0, + ABS = 1 << 1 + }; +} + +namespace SIOutMods { + enum { + NONE = 0, + MUL2 = 1, + MUL4 = 2, + DIV2 = 3 + }; +} + +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C +#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) + +#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double +// precision. 
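The S_*/G_*/C_* macro triples above follow a set/get/clear pattern: S_ shifts a field value into position, G_ extracts it, and C_ masks the field out so a new value can be inserted. A small standalone demonstration on the FLOAT_MODE field of COMPUTE_PGM_RSRC1, with the relevant macro definitions copied from the header above:

    #include <cassert>
    #include <cstdint>

    #define S_00B848_FLOAT_MODE(x)  (((x) & 0xFF) << 12)
    #define G_00B848_FLOAT_MODE(x)  (((x) >> 12) & 0xFF)
    #define C_00B848_FLOAT_MODE     0xFFF00FFF
    #define FP_ROUND_ROUND_TO_ZERO  3
    #define FP_ROUND_MODE_SP(x)     ((x) & 0x3)

    int main() {
      uint32_t Rsrc1 = 0xFFFFFFFF;   // pretend register contents
      uint32_t Mode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_ZERO);

      // Clear the old FLOAT_MODE field, then insert the new value.
      Rsrc1 = (Rsrc1 & C_00B848_FLOAT_MODE) | S_00B848_FLOAT_MODE(Mode);

      assert(G_00B848_FLOAT_MODE(Rsrc1) == Mode);  // G_ round-trips the field
      assert((Rsrc1 & 0xFFF) == 0xFFF);            // bits outside it untouched
      return 0;
    }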
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) + +#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 +#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) + + +#endif diff --git a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp new file mode 100644 index 00000000000..5fe8d19426d --- /dev/null +++ b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -0,0 +1,96 @@ +//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Spilling of EXEC masks used for control flow messes up control flow +/// lowering, so mark all live intervals associated with CF instructions as +/// non-spillable. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-cf-live-intervals" + +namespace { + +class SIFixControlFlowLiveIntervals : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix CF Live Intervals"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) + +char SIFixControlFlowLiveIntervals::ID = 0; + +char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; + +FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { + return new SIFixControlFlowLiveIntervals(); +} + +bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { + LiveIntervals *LIS = &getAnalysis(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + case AMDGPU::SI_END_CF: { + unsigned Reg = MI.getOperand(0).getReg(); + LIS->getInterval(Reg).markNotSpillable(); + break; + } + default: + break; + } + } + } + + return false; +} diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp new file mode 100644 index 00000000000..23502b45905 --- /dev/null +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -0,0 +1,338 @@ +//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Copies from VGPR to SGPR registers are illegal and the register coalescer +/// will sometimes generate these illegal copies in situations like this: +/// +/// Register Class is the union of and +/// +/// BB0: +/// %vreg0 = SCALAR_INST +/// %vreg1 = COPY %vreg0 +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 = VECTOR_INST +/// %vreg3 = COPY %vreg2 +/// BB2: +/// %vreg4 = PHI %vreg1 , , %vreg3 , +/// %vreg5 = VECTOR_INST %vreg4 +/// +/// +/// The coalescer will begin at BB0 and eliminate its copy, then the resulting +/// code will look like this: +/// +/// BB0: +/// %vreg0 = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 = VECTOR_INST +/// %vreg3 = COPY %vreg2 +/// BB2: +/// %vreg4 = PHI %vreg0 , , %vreg3 , +/// %vreg5 = VECTOR_INST %vreg4 +/// +/// Now that the result of the PHI instruction is an SGPR, the register +/// allocator is now forced to constrain the register class of %vreg3 to +/// so we end up with final code like this: +/// +/// BB0: +/// %vreg0 = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 = VECTOR_INST +/// %vreg3 = COPY %vreg2 +/// BB2: +/// %vreg4 = PHI %vreg0 , , %vreg3 , +/// %vreg5 = VECTOR_INST %vreg4 +/// +/// Now this code contains an illegal copy from a VGPR to an SGPR. +/// +/// In order to avoid this problem, this pass searches for PHI instructions +/// which define a register and constrains its definition class to +/// if the user of the PHI's definition register is a vector instruction. +/// If the PHI's definition class is constrained to then the coalescer +/// will be unable to perform the COPY removal from the above example which +/// ultimately led to the creation of an illegal COPY. 
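The underlying problem the comment above describes is divergence: an SGPR holds one value for the whole wave, while lanes that took different branches need different PHI results. A standalone C++ model of that conflict, with a four-lane wave and incoming values invented for the illustration:

    #include <cstdio>

    int main() {
      const int NumLanes = 4;
      bool Cond[NumLanes] = {true, false, true, false};  // divergent branch
      int  VgprPhi[NumLanes];  // per-lane storage: each lane keeps its own value
      int  SgprPhi = 0;        // one value shared by the whole wave

      // Lanes where Cond is true "execute BB1", the rest keep the BB0 value.
      for (int L = 0; L < NumLanes; ++L) {
        int Val = Cond[L] ? 200 : 100;  // the PHI's incoming values
        VgprPhi[L] = Val;               // correct: independent per lane
        SgprPhi    = Val;               // wrong: later lanes overwrite earlier ones
      }

      for (int L = 0; L < NumLanes; ++L)
        std::printf("lane %d: vgpr=%d sgpr=%d\n", L, VgprPhi[L], SgprPhi);
      // Lanes 0 and 2 expect 200 but the shared scalar ends up holding 100,
      // which is why such PHIs must stay in vector registers.
      return 0;
    }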
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "sgpr-copies" + +namespace { + +class SIFixSGPRCopies : public MachineFunctionPass { + +private: + static char ID; + const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const; + const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const; + bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const; + +public: + SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR copies"; + } + +}; + +} // End anonymous namespace + +char SIFixSGPRCopies::ID = 0; + +FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { + return new SIFixSGPRCopies(tm); +} + +static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + continue; + + if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) + return true; + } + return false; +} + +/// This functions walks the use list of Reg until it finds an Instruction +/// that isn't a COPY returns the register class of that instruction. +/// \return The register defined by the first non-COPY instruction. +const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const { + + const TargetRegisterClass *RC + = TargetRegisterInfo::isVirtualRegister(Reg) ? 
+ MRI.getRegClass(Reg) : + TRI->getPhysRegClass(Reg); + + RC = TRI->getSubRegClass(RC, SubReg); + for (MachineRegisterInfo::use_instr_iterator + I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { + switch (I->getOpcode()) { + case AMDGPU::COPY: + RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, + I->getOperand(0).getReg(), + I->getOperand(0).getSubReg())); + break; + } + } + + return RC; +} + +const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); + return TRI->getSubRegClass(RC, SubReg); + } + MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() != AMDGPU::COPY) { + return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); + } + + return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), + Def->getOperand(1).getSubReg()); +} + +bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const { + + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't really + // much we can do to fix this. + return false; + } + + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + + const TargetRegisterClass *SrcRC; + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + return false; + + SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); + return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); +} + +bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo *TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); + DEBUG(MI.print(dbgs())); + TII->moveToVALU(MI); + + } + + switch (MI.getOpcode()) { + default: continue; + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + const MachineOperand &Op = MI.getOperand(i); + unsigned Reg = Op.getReg(); + const TargetRegisterClass *RC + = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + + MRI.constrainRegClass(Op.getReg(), RC); + } + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, + MI.getOperand(0).getSubReg()); + if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { + MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + } + + if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) + break; + + // If a PHI node defines an SGPR and any of its operands are VGPRs, + // then we need to move it to the VALU. 
+ // + // Also, if a PHI node defines an SGPR and has all SGPR operands + // we must move it to the VALU, because the SGPR operands will + // all end up being assigned the same register, which means + // there is a potential for a conflict if different threads take + // different control flow paths. + // + // For Example: + // + // sgpr0 = def; + // ... + // sgpr1 = def; + // ... + // sgpr2 = PHI sgpr0, sgpr1 + // use sgpr2; + // + // Will Become: + // + // sgpr2 = def; + // ... + // sgpr2 = def; + // ... + // use sgpr2 + // + // FIXME: This is OK if the branching decision is made based on an + // SGPR value. + bool SGPRBranch = false; + + // The one exception to this rule is when one of the operands + // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK + // instruction. In this case, there we know the program will + // never enter the second block (the loop) without entering + // the first block (where the condition is computed), so there + // is no chance for values to be over-written. + + bool HasBreakDef = false; + for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { + unsigned Reg = MI.getOperand(i).getReg(); + if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { + TII->moveToVALU(MI); + break; + } + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch(DefInstr->getOpcode()) { + + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + // If we see a PHI instruction that defines an SGPR, then that PHI + // instruction has already been considered and should have + // a *_BREAK as an operand. + case AMDGPU::PHI: + HasBreakDef = true; + break; + } + } + + if (!SGPRBranch && !HasBreakDef) + TII->moveToVALU(MI); + break; + } + case AMDGPU::REG_SEQUENCE: { + if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || + !hasVGPROperands(MI, TRI)) + continue; + + DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); + + TII->moveToVALU(MI); + break; + } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); + TII->moveToVALU(MI); + } + break; + } + } + } + } + + return true; +} diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp new file mode 100644 index 00000000000..0c54446b0fb --- /dev/null +++ b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,192 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// SALU instructions ignore control flow, so we need to modify the live ranges +/// of the registers they define in some cases. +/// +/// The main case we need to handle is when a def is used in one side of a +/// branch and not another. For example: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// +/// Here we need the register allocator to avoid assigning any of the defs +/// inside of the IF to the same register as %def. 
In traditional live +/// interval analysis %def is not live inside the IF branch, however, since +/// SALU instructions inside of IF will be executed even if the branch is not +/// taken, there is the chance that one of the instructions will overwrite the +/// value of %def, so the use in ELSE will see the wrong value. +/// +/// The strategy we use for solving this is to add an extra use after the ENDIF: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// %use +/// +/// Adding this use will make the def live thoughout the IF branch, which is +/// what we want. + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-sgpr-live-ranges" + +namespace { + +class SIFixSGPRLiveRanges : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR live ranges"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) + +char SIFixSGPRLiveRanges::ID = 0; + +char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; + +FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { + return new SIFixSGPRLiveRanges(); +} + +bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const SIRegisterInfo *TRI = static_cast( + MF.getSubtarget().getRegisterInfo()); + LiveIntervals *LIS = &getAnalysis(); + MachinePostDominatorTree *PDT = &getAnalysis(); + std::vector> SGPRLiveRanges; + + // First pass, collect all live intervals for SGPRs + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.defs()) { + if (MO.isImplicit()) + continue; + unsigned Def = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def)) { + if (TRI->isSGPRClass(MRI.getRegClass(Def))) + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getInterval(Def))); + } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getRegUnit(Def))); + } + } + } + } + + // Second pass fix the intervals + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + MachineBasicBlock &MBB = *BI; + if (MBB.succ_size() < 2) + continue; + + // We have structured control flow, so number of succesors should be two. 
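The hazard motivating this pass is that scalar instructions execute regardless of the exec mask, so an SGPR that the allocator considers dead on the untaken side of a branch can still be overwritten there. A standalone C++ model of the clobber; the register slots and values are invented, and real allocation is of course done by the register allocator:

    #include <cstdio>

    // Toy model: a "physical SGPR file" with two slots. If the allocator
    // thinks %def is dead inside the IF block, it may give the IF-side temp
    // the same slot as %def; because scalar writes ignore Exec, the temp is
    // written even when the whole wave is on the ELSE path.
    int main() {
      int SgprFile[2] = {0, 0};
      const int DefSlot = 0;

      SgprFile[DefSlot] = 42;      // %def
      bool WaveTakesElse = true;   // every lane branches to the ELSE block

      // The IF block's scalar instructions still execute:
      int TempSlot = DefSlot;      // allocator reused the slot (the bug)
      SgprFile[TempSlot] = 7;      // SALU write, not masked by Exec

      if (WaveTakesElse)
        std::printf("%%use in ELSE reads %d, expected 42\n", SgprFile[DefSlot]);

      // Adding an artificial use of %def after the ENDIF keeps it live across
      // both sides, forcing the allocator to pick a different slot for the temp.
      return 0;
    }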
+ assert(MBB.succ_size() == 2); + MachineBasicBlock *SuccA = *MBB.succ_begin(); + MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); + + if (!NCD) + continue; + + MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); + + if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { + assert(NCD->succ_size() == 2); + // We want to make sure we insert the Use after the ENDIF, not after + // the ELSE. + NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), + *(++NCD->succ_begin())); + } + assert(SuccA && SuccB); + for (std::pair RegLR : SGPRLiveRanges) { + unsigned Reg = RegLR.first; + LiveRange *LR = RegLR.second; + + // FIXME: We could be smarter here. If the register is Live-In to + // one block, but the other doesn't have any SGPR defs, then there + // won't be a conflict. Also, if the branch decision is based on + // a value in an SGPR, then there will be no conflict. + bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); + bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); + + if ((!LiveInToA && !LiveInToB) || + (LiveInToA && LiveInToB)) + continue; + + // This interval is live in to one successor, but not the other, so + // we need to update its range so it is live in to both. + DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << + " BB#" << SuccA->getNumber() << ", BB#" << + SuccB->getNumber() << + " with NCD = " << NCD->getNumber() << '\n'); + + // FIXME: Need to figure out how to update LiveRange here so this pass + // will be able to preserve LiveInterval analysis. + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + DEBUG(NCD->getFirstNonPHI()->dump()); + } + } + + return false; +} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp new file mode 100644 index 00000000000..d14e37a6461 --- /dev/null +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -0,0 +1,288 @@ +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-fold-operands" +using namespace llvm; + +namespace { + +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fold Operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +struct FoldCandidate { + MachineInstr *UseMI; + unsigned UseOpNo; + MachineOperand *OpToFold; + uint64_t ImmToFold; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), UseOpNo(OpNo) { + + if (FoldOp->isImm()) { + OpToFold = nullptr; + ImmToFold = FoldOp->getImm(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isImm() const { + return !OpToFold; + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) + +char SIFoldOperands::ID = 0; + +char &llvm::SIFoldOperandsID = SIFoldOperands::ID; + +FunctionPass *llvm::createSIFoldOperandsPass() { + return new SIFoldOperands(); +} + +static bool isSafeToFold(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + +static bool updateOperand(FoldCandidate &Fold, + const TargetRegisterInfo &TRI) { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } + + MachineOperand *New = Fold.OpToFold; + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && + TargetRegisterInfo::isVirtualRegister(New->getReg())) { + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + return true; + } + + // FIXME: Handle physical registers. + + return false; +} + +static bool tryAddToFoldList(std::vector &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *OpToFold, + const SIInstrInfo *TII) { + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + // Operand is not legal, so try to commute the instruction to + // see if this makes it possible to fold. 
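At its core the pass replaces a register operand with the value defined by a safe-to-fold move, provided the use can still encode the result. A standalone C++ sketch of that idea on a toy instruction representation; all types, names, and the inline-constant range check below are invented stand-ins, not the target's real rules:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy MI: an opcode plus operands that are either a register name or an
    // immediate. Operands[0] is the def.
    struct Operand {
      bool IsImm;
      long Imm;
      std::string Reg;
    };
    struct Inst {
      std::string Opcode;
      std::vector<Operand> Ops;
    };

    // Stand-in legality check for "can this use encode the immediate?".
    static bool canEncodeInline(long Imm) { return Imm >= -16 && Imm <= 64; }

    int main() {
      // %v0 = V_MOV_B32 7    ;    %v1 = V_ADD_I32 %v0, %v2
      Inst Mov{"V_MOV_B32", {{false, 0, "%v0"}, {true, 7, ""}}};
      Inst Add{"V_ADD_I32",
               {{false, 0, "%v1"}, {false, 0, "%v0"}, {false, 0, "%v2"}}};

      // Fold: if the moved value is an encodable immediate, rewrite every use
      // of the MOV's def to carry the immediate directly.
      if (Mov.Ops[1].IsImm && canEncodeInline(Mov.Ops[1].Imm)) {
        for (Operand &Op : Add.Ops)
          if (!Op.IsImm && Op.Reg == Mov.Ops[0].Reg)
            Op = Mov.Ops[1];
      }

      std::printf("%s now uses %s\n", Add.Opcode.c_str(),
                  Add.Ops[1].IsImm ? "an inline immediate" : "a register");
      return 0;
    }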
+ unsigned CommuteIdx0; + unsigned CommuteIdx1; + bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + + if (CanCommute) { + if (CommuteIdx0 == OpNo) + OpNo = CommuteIdx1; + else if (CommuteIdx1 == OpNo) + OpNo = CommuteIdx0; + } + + if (!CanCommute || !TII->commuteInstruction(MI)) + return false; + + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; +} + +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (!isSafeToFold(MI.getOpcode())) + continue; + + unsigned OpSize = TII->getOpSize(MI, 1); + MachineOperand &OpToFold = MI.getOperand(1); + bool FoldingImm = OpToFold.isImm(); + + // FIXME: We could also be folding things like FrameIndexes and + // TargetIndexes. + if (!FoldingImm && !OpToFold.isReg()) + continue; + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && + !MRI.hasOneUse(MI.getOperand(0).getReg())) + continue; + + // FIXME: Fold operands with subregs. + if (OpToFold.isReg() && + (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || + OpToFold.getSubReg())) + continue; + + std::vector FoldList; + for (MachineRegisterInfo::use_iterator + Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); + Use != E; ++Use) { + + MachineInstr *UseMI = Use->getParent(); + const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + continue; + } + + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + // Split 64-bit constants into 32-bits for folding. + if (UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + continue; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + continue; + + UseMI->setDesc(TII->get(MovOp)); + } + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. 
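When a 64-bit constant reaches a use through a 32-bit sub-register, only the matching half is folded, which is what the sub0/sub1 handling above does with APInt::getLoBits/getHiBits. The same split in plain, standalone C++; the constant is arbitrary:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Imm = 0x123456789ABCDEF0ULL;  // the 64-bit constant being folded

      // sub0 is the low 32 bits of the register pair, sub1 the high 32 bits.
      uint32_t Sub0 = static_cast<uint32_t>(Imm & 0xFFFFFFFFu);
      uint32_t Sub1 = static_cast<uint32_t>(Imm >> 32);

      assert(Sub0 == 0x9ABCDEF0u && Sub1 == 0x12345678u);
      std::printf("sub0 = 0x%08X, sub1 = 0x%08X\n", Sub0, Sub1);

      // Reassembling the halves gives back the original constant.
      assert(((uint64_t(Sub1) << 32) | Sub0) == Imm);
      return 0;
    }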
+ if (UseDesc.isVariadic() || + UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) + continue; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); + continue; + } + + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + } + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, TRI)) { + // Clear kill flags. + if (!Fold.isImm()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + Fold.OpToFold->setIsKill(false); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); + } + } + } + } + return false; +} diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp new file mode 100644 index 00000000000..12d08cf4c7f --- /dev/null +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -0,0 +1,2241 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for SI +// +//===----------------------------------------------------------------------===// + +#ifdef _MSC_VER +// Provide M_PI. +#define _USE_MATH_DEFINES +#include +#endif + +#include "SIISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); + + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + setOperationAction(ISD::ADD, MVT::i32, Legal); + 
setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Promote); + AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + + 
setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i1, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i1, Promote); + + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch(Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; + default: + setOperationAction(Op, VT, Expand); + break; + } + } + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + } + + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::UMAX); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::UINT_TO_FP); + + // All memory operations. Some folding on the pointer operand is done to help + // matching the constant offsets in the addressing modes. + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + setTargetDAGCombine(ISD::ATOMIC_SWAP); + setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); + setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); + setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); + + setSchedulingPreference(Sched::RegPressure); +} + +//===----------------------------------------------------------------------===// +// TargetLowering queries +//===----------------------------------------------------------------------===// + +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. 
+ return false; +} + +bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty, unsigned AS) const { + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // SMRD instructions have an 8-bit, dword offset. + // + // Assume nonunifom access, since the address space isn't enough to know + // what instruction we will use, and since we don't know if this is a load + // or store and scalar stores are only available on VI. + // + // We also know if we are doing an extload, we can't do a scalar load. + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // Basic, single offset DS instructions allow a 16-bit unsigned immediate + // field. + // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have + // an 8-bit dword offset but we don't know the alignment here. + if (!isUInt<16>(AM.BaseOffs)) + return false; + + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. + return true; + + if (AM.Scale == 1 && AM.HasBaseReg) + return true; + + return false; + } + case AMDGPUAS::FLAT_ADDRESS: { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); + } + default: + llvm_unreachable("unhandled address space"); + } +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + if (!VT.isSimple() || VT == MVT::Other) + return false; + + // TODO - CI+ supports unaligned memory accesses, but this requires driver + // support. + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + return Align % 4 == 0; + } + + // Smaller than dword value must be aligned. 
+ // FIXME: This should be allowed on CI+ + if (VT.bitsLT(MVT::i32)) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. + // This applies to private, global, and constant memory. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit accesses. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Use the default. + return MVT::Other; +} + +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + return TII->isInlineConstant(Imm); +} + +static EVT toIntegerVT(EVT VT) { + if (VT.isVector()) + return VT.changeVectorElementTypeToInteger(); + return MVT::getIntegerVT(VT.getSizeInBits()); +} + +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + SDLoc SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout *DL = getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); + SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + + unsigned Align = DL->getABITypeAlignment(Ty); + + if (VT != MemVT && VT.isFloatingPoint()) { + // Do an integer load and convert. + // FIXME: This is mostly because load legalization after type legalization + // doesn't handle FP extloads. + assert(VT.getScalarType() == MVT::f32 && + MemVT.getScalarType() == MVT::f16); + + EVT IVT = toIntegerVT(VT); + EVT MemIVT = toIntegerVT(MemVT); + SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, + IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment + return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); + } + + ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, + VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment +} + +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo(); + + assert(CallConv == CallingConv::C); + + SmallVector Splits; + BitVector Skipped(Ins.size()); + + for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + + // First check if it's a PS input addr + if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + !Arg.Flags.isByVal()) { + + assert((PSInputNum <= 15) && "Too many PS inputs!"); + + if (!Arg.Used) { + // We can savely skip PS inputs + Skipped.set(i); + ++PSInputNum; + continue; + } + + Info->PSInputAddr |= 1 << PSInputNum++; + } + + // Second split vertices into their elements + if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eigth. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + + } else if (Info->getShaderType() != ShaderType::COMPUTE) { + Splits.push_back(Arg); + } + } + + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // At least one interpolation mode must be enabled or else the GPU will hang. + if (Info->getShaderType() == ShaderType::PIXEL && + (Info->PSInputAddr & 0x7F) == 0) { + Info->PSInputAddr |= 1; + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + } + + // The pointer to the list of arguments is stored in SGPR0, SGPR1 + // The pointer to the scratch buffer is stored in SGPR2, SGPR3 + if (Info->getShaderType() == ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS()) + Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
+ else + Info->NumUserSGPRs = 4; + + unsigned InputPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrRegLo = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned InputPtrRegHi = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); + + unsigned ScratchPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchPtrRegLo = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned ScratchPtrRegHi = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); + + CCInfo.AllocateReg(InputPtrRegLo); + CCInfo.AllocateReg(InputPtrRegHi); + CCInfo.AllocateReg(ScratchPtrRegLo); + CCInfo.AllocateReg(ScratchPtrRegHi); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); + } + + if (Info->getShaderType() == ShaderType::COMPUTE) { + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } + + AnalyzeFormalArguments(CCInfo, Splits); + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + + const ISD::InputArg &Arg = Ins[i]; + if (Skipped[i]) { + InVals.push_back(DAG.getUNDEF(Arg.VT)); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + MVT VT = VA.getLocVT(); + + if (VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = Splits[i].VT; + const unsigned Offset = 36 + VA.getLocMemOffset(); + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), + Offset, Ins[i].Flags.isSExt()); + + const PointerType *ParamTy = + dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. 
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + + InVals.push_back(Arg); + Info->ABIArgOffset = Offset + MemVT.getStoreSize(); + continue; + } + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + + if (VT == MVT::i64) { + // For now assume it is a pointer + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, + &AMDGPU::SReg_64RegClass); + Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); + InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + continue; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + + if (Arg.VT.isVector()) { + + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + } + + // Fill up the missing vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + Regs.append(NumElements, DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + continue; + } + + InVals.push_back(Val); + } + + if (Info->getShaderType() != ShaderType::COMPUTE) { + unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef( + AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); + Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + } + return Chain; +} + +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + + MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + switch (MI->getOpcode()) { + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: + return BB; + case AMDGPU::SI_RegisterStorePseudo: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), + Reg); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) + MIB.addOperand(MI->getOperand(i)); + + MI->eraseFromParent(); + break; + } + } + return BB; +} + +bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { + // This currently forces unfolding various combinations of fsub into fma with + // free fneg'd operands. As long as we have fast FMA (controlled by + // isFMAFasterThanFMulAndFAdd), we should perform these. + + // When fma is quarter rate, for f64 where add / sub are at best half rate, + // most of these combines appear to be cycle neutral but save on instruction + // count / code size. + return true; +} + +EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { + if (!VT.isVector()) { + return MVT::i1; + } + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); +} + +MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { + return MVT::i32; +} + +// Answering this is somewhat tricky and depends on the specific device which +// have different rates for fma or all f64 operations. 
+// +// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other +// regardless of which device (although the number of cycles differs between +// devices), so it is always profitable for f64. +// +// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable +// only on full rate devices. Normally, we should prefer selecting v_mad_f32 +// which we can always do even without fused FP ops since it returns the same +// result as the separate operations and since it is always full +// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 +// however does not support denormals, so we do report fma as faster if we have +// a fast fma device and require denormals. +// +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + // This is as fast on some subtargets. However, we always have full rate f32 + // mad available which returns the same result as the separate operations + // which we should prefer over fma. We can't use this if we want to support + // denormals, so only report this in these cases. + return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::FSIN: + case ISD::FCOS: + return LowerTrig(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::GlobalAddress: { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + return LowerGlobalAddress(MFI, Op, DAG); + } + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + } + return SDValue(); +} + +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + + SDNode *Parent = Value.getNode(); + for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); + I != E; ++I) { + + if (I.getUse().get() != Value) + continue; + + if (I->getOpcode() == Opcode) + return *I; + } + return nullptr; +} + +SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + FrameIndexSDNode *FINode = cast(Op); + unsigned FrameIndex = FINode->getIndex(); + + return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); +} + +/// This transforms the control flow intrinsics to get the branch destination as +/// last parameter, also switches branch target with BR if the need arise +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, + SelectionDAG &DAG) const { + + SDLoc DL(BRCOND); + + SDNode *Intr = BRCOND.getOperand(1).getNode(); + SDValue Target = BRCOND.getOperand(2); + SDNode *BR = nullptr; + + if 
(Intr->getOpcode() == ISD::SETCC) { + // As long as we negate the condition everything is fine + SDNode *SetCC = Intr; + assert(SetCC->getConstantOperandVal(1) == 1); + assert(cast(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE); + Intr = SetCC->getOperand(0).getNode(); + + } else { + // Get the target from BR if we don't negate the condition + BR = findUser(BRCOND, ISD::BR); + Target = BR->getOperand(1); + } + + assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + + // Build the result and + ArrayRef Res(Intr->value_begin() + 1, Intr->value_end()); + + // operands of the new intrinsic call + SmallVector Ops; + Ops.push_back(BRCOND.getOperand(0)); + Ops.append(Intr->op_begin() + 1, Intr->op_end()); + Ops.push_back(Target); + + // build the new intrinsic call + SDNode *Result = DAG.getNode( + Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, + DAG.getVTList(Res), Ops).getNode(); + + if (BR) { + // Give the branch instruction our target + SDValue Ops[] = { + BR->getOperand(0), + BRCOND.getOperand(2) + }; + SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); + DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); + BR = NewBR.getNode(); + } + + SDValue Chain = SDValue(Result, Result->getNumValues() - 1); + + // Copy the intrinsic results to registers + for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { + SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); + if (!CopyToReg) + continue; + + Chain = DAG.getCopyToReg( + Chain, DL, + CopyToReg->getOperand(1), + SDValue(Result, i - 1), + SDValue()); + + DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); + } + + // Remove the old intrinsic from the chain + DAG.ReplaceAllUsesOfValueWith( + SDValue(Intr, Intr->getNumValues() - 1), + Intr->getOperand(0)); + + return Chain; +} + +SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + GlobalAddressSDNode *GSD = cast(Op); + + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + SDLoc DL(GSD); + const GlobalValue *GV = GSD->getGlobal(); + MVT PtrVT = getPointerTy(GSD->getAddressSpace()); + + SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); + + SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(0, DL, MVT::i32)); + SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(1, DL, MVT::i32)); + + SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrLo, GA); + SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrHi, DAG.getConstant(0, DL, MVT::i32), + SDValue(Lo.getNode(), 1)); + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); +} + +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + SDValue V) const { + // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, + // so we will end up with redundant moves to m0. + // + // We can't use S_MOV_B32, because there is no way to specify m0 as the + // destination register. + // + // We have to use them both. Machine cse will combine all the S_MOV_B32 + // instructions and the register coalescer eliminate the extra copies. 
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); + return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), + SDValue(M0, 0), SDValue()); // Glue + // A Null SDValue creates + // a glue result. +} + +SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + + switch (IntrinsicID) { + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + case Intrinsic::r600_read_local_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_X, false); + case Intrinsic::r600_read_local_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + case Intrinsic::r600_read_local_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Z, false); + + case Intrinsic::AMDGPU_read_workdim: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + MF.getInfo()->ABIArgOffset, + false); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops[] = { + Op.getOperand(1), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return 
LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. + return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), + DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); + case AMDGPUIntrinsic::SI_fs_constant: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(1), Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_fs_interp: { + SDValue IJ = Op.getOperand(4); + SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(0, DL, MVT::i32)); + SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(1, DL, MVT::i32)); + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, + DAG.getVTList(MVT::f32, MVT::Glue), + I, Op.getOperand(1), Op.getOperand(2), Glue); + Glue = SDValue(P1.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, + Op.getOperand(1), Op.getOperand(2), Glue); + } + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } +} + +SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_sendmsg: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: + return SDValue(); + } +} + +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast(Op); + + if (Op.getValueType().isVector()) { + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned NumElements = Op.getValueType().getVectorNumElements(); + assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { + default: break; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + // v4 loads are supported for private and global memory. 
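+      // Wider global/private vectors fall through and are scalarized below,
+      // as are all LDS vector loads that reach this point.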
+ if (NumElements <= 4) + break; + // fall-through + case AMDGPUAS::LOCAL_ADDRESS: + return ScalarizeVectorLoad(Op, DAG); + } + } + + return AMDGPUTargetLowering::LowerLOAD(Op, DAG); +} + +SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, + const SDValue &Op, + SelectionDAG &DAG) const { + return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4)); +} + +SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i64) + return SDValue(); + + SDLoc DL(Op); + SDValue Cond = Op.getOperand(0); + + SDValue Zero = DAG.getConstant(0, DL, MVT::i32); + SDValue One = DAG.getConstant(1, DL, MVT::i32); + + SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); + + SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); + SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); + + SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); + + SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); + SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); + + SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); + + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); +} + +// Catch division cases where we can use shortcuts with rcp and rsq +// instructions. +SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getValueType(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + + if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { + if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && + CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + + // 1.0 / sqrt(x) -> rsq(x) + // + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + } + + if (Unsafe) { + // Turn into multiply by the reciprocal. + // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + } + + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + SDValue FastLowered = LowerFastFDIV(Op, DAG); + if (FastLowered.getNode()) + return FastLowered; + + // This uses v_rcp_f32 which does not handle denormals. Let this hit a + // selection error for now rather than do something incorrect. 
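+  // The expansion below also pre-scales very large denominators: if |RHS|
+  // exceeds 0x6f800000 (2^96), RHS is first multiplied by 0x2f800000 (2^-32)
+  // and the quotient is multiplied by the same 2^-32 afterwards, which keeps
+  // the intermediate reciprocal out of the denormal range that v_rcp_f32
+  // would otherwise flush.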
+ if (Subtarget->hasFP32Denormals()) + return SDValue(); + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} + +SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getTarget().Options.UnsafeFPMath) + return LowerFastFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); + + SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); + + SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); + + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); + + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); + + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); + + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); + + SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); + + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); + + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, + NegDivScale0, Mul, DivScale1); + + SDValue Scale; + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. + + const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); + + // Figure out if the scale to use for div_fmas. 
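+    // Reconstruct that condition by hand: compare the high dword of each
+    // div_scale result with the high dword of the corresponding original
+    // operand to detect whether it was rescaled, then XOR the two tests to
+    // form the scale flag that div_fmas expects.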
+ SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *Store = cast(Op); + EVT VT = Store->getMemoryVT(); + + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return ScalarizeVectorStore(Op, DAG); + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) + return Ret; + + if (VT.isVector() && VT.getVectorNumElements() >= 8) + return ScalarizeVectorStore(Op, DAG); + + if (VT == MVT::i1) + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + + return SDValue(); +} + +SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.5/M_PI, DL, + VT))); + + switch (Op.getOpcode()) { + case ISD::FCOS: + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); + case ISD::FSIN: + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); + default: + llvm_unreachable("Wrong trig opcode"); + } +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. 
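+  // Scalar case: if the 24 high bits of the i32 source are known to be zero,
+  // the uint_to_fp only ever sees a byte value and can be replaced with a
+  // single CVT_F32_UBYTE0.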
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + DCI.AddToWorklist(Cvt.getNode()); + return Cvt; + } + } + + // We are primarily trying to catch operations on illegal vector types + // before they are expanded. + // For scalars, we can use the more flexible method of checking masked bits + // after legalization. + if (!DCI.isBeforeLegalize() || + !SrcVT.isVector() || + SrcVT.getVectorElementType() != MVT::i8) { + return SDValue(); + } + + assert(DCI.isBeforeLegalize() && "Unexpected legal type"); + + // Weird sized vectors are a pain to handle, but we know 3 is really the same + // size as 4. + unsigned NElts = SrcVT.getVectorNumElements(); + if (!SrcVT.isSimple() && NElts != 3) + return SDValue(); + + // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to + // prevent a mess from expanding to v4i32 and repacking. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); + EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + LoadSDNode *Load = cast(Src); + + unsigned AS = Load->getAddressSpace(); + unsigned Align = Load->getAlignment(); + Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + + // Don't try to replace the load if we have to expand it due to alignment + // problems. Otherwise we will end up scalarizing the load, and trying to + // repack into the vector for no real reason. + if (Align < ABIAlignment && + !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { + return SDValue(); + } + + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, + Load->getChain(), + Load->getBasePtr(), + LoadVT, + Load->getMemOperand()); + + // Make sure successors of the original load stay after it by updating + // them to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + + SmallVector Elts; + if (RegVT.isVector()) + DAG.ExtractVectorElements(NewLoad, Elts); + else + Elts.push_back(NewLoad); + + SmallVector Ops; + + unsigned EltIdx = 0; + for (SDValue Elt : Elts) { + unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); + for (unsigned I = 0; I < ComponentsInElt; ++I) { + unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; + SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); + DCI.AddToWorklist(Cvt.getNode()); + Ops.push_back(Cvt); + } + + ++EltIdx; + } + + assert(Ops.size() == NElts); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); + } + + return SDValue(); +} + +/// \brief Return true if the given offset Size in bytes can be folded into +/// the immediate offsets of a memory instruction for the given address space. +static bool canFoldOffset(unsigned OffsetSize, unsigned AS, + const AMDGPUSubtarget &STI) { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI. 
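+    // For example, a 1020 byte offset is encodable on both (dword offset 255
+    // on SI, byte offset 1020 on VI), while 1024 bytes already exceeds the
+    // 8-bit dword field on SI.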
+ if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } +} + +// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) + +// This is a variant of +// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), +// +// The normal DAG combiner will do this, but only if the add has one use since +// that would increase the number of instructions. +// +// This prevents us from seeing a constant offset that can be folded into a +// memory instruction's addressing mode. If we know the resulting add offset of +// a pointer can be folded into an addressing offset, we can replace the pointer +// operand with the add of new constant offset. This eliminates one of the uses, +// and may allow the remaining use to also be simplified. +// +SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, + unsigned AddrSpace, + DAGCombinerInfo &DCI) const { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N0.getOpcode() != ISD::ADD) + return SDValue(); + + const ConstantSDNode *CN1 = dyn_cast(N1); + if (!CN1) + return SDValue(); + + const ConstantSDNode *CAdd = dyn_cast(N0.getOperand(1)); + if (!CAdd) + return SDValue(); + + // If the resulting offset is too large, we can't fold it into the addressing + // mode offset. + APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + + SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); + SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); + + return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); +} + +SDValue SITargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::SETCC && + RHS.getOpcode() == ISD::SETCC) { + ISD::CondCode LCC = cast(LHS.getOperand(2))->get(); + ISD::CondCode RCC = cast(RHS.getOperand(2))->get(); + + SDValue X = LHS.getOperand(0); + SDValue Y = RHS.getOperand(0); + if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) + return SDValue(); + + if (LCC == ISD::SETO) { + if (X != LHS.getOperand(1)) + return SDValue(); + + if (RCC == ISD::SETUNE) { + const ConstantFPSDNode *C1 = dyn_cast(RHS.getOperand(1)); + if (!C1 || !C1->isInfinity() || C1->isNegative()) + return SDValue(); + + const uint32_t Mask = SIInstrFlags::N_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::P_NORMAL; + + static_assert(((~(SIInstrFlags::S_NAN | + SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | + SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, + "mask not equal"); + + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + X, DAG.getConstant(Mask, DL, MVT::i32)); + } + } + } + + return SDValue(); +} + +SDValue SITargetLowering::performOrCombine(SDNode *N, + 
DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. + static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + Src, DAG.getConstant(NewMask, DL, MVT::i32)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performClassCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue Mask = N->getOperand(1); + + // fp_class x, 0 -> false + if (const ConstantSDNode *CMask = dyn_cast(Mask)) { + if (CMask->isNullValue()) + return DAG.getConstant(0, SDLoc(N), MVT::i1); + } + + return SDValue(); +} + +static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return AMDGPUISD::FMAX3; + case ISD::SMAX: + return AMDGPUISD::SMAX3; + case ISD::UMAX: + return AMDGPUISD::UMAX3; + case ISD::FMINNUM: + return AMDGPUISD::FMIN3; + case ISD::SMIN: + return AMDGPUISD::SMIN3; + case ISD::UMIN: + return AMDGPUISD::UMIN3; + default: + llvm_unreachable("Not a min/max opcode"); + } +} + +SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Only do this if the inner op has one use since this will just increases + // register pressure for no benefit. 
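+  // (If the inner min/max had other users it would still have to be emitted,
+  // so forming a min3/max3 would add an instruction rather than replace one.)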
+ + // max(max(a, b), c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // max(a, max(b, c)) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performSetCCCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = LHS.getValueType(); + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Match isinf pattern + // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) + ISD::CondCode CC = cast(N->getOperand(2))->get(); + if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + const ConstantFPSDNode *CRHS = dyn_cast(RHS); + if (!CRHS) + return SDValue(); + + const APFloat &APF = CRHS->getValueAPF(); + if (APF.isInfinity() && !APF.isNegative()) { + unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(Mask, SL, MVT::i32)); + } + } + + return SDValue(); +} + +SDValue SITargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch (N->getOpcode()) { + default: + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::SETCC: + return performSetCCCombine(N, DCI); + case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMINNUM: + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: { + if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + N->getValueType(0) != MVT::f64 && + getTargetMachine().getOptLevel() > CodeGenOpt::None) + return performMin3Max3Combine(N, DCI); + break; + } + + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: { + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + + case ISD::UINT_TO_FP: { + return performUCharToFloatCombine(N, DCI); + + case ISD::FADD: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + if (VT != MVT::f32) + break; + + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (Subtarget->hasFP32Denormals()) + break; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // These should really be instruction patterns, but writing patterns with + // source modiifiers is a pain. 
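+  // Note that 2.0 and -2.0 are inline constants, so each rewrite below folds
+  // the two-add sequence into a single full rate v_mad_f32 with no extra
+  // literal.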
+ + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); + } + } + + return SDValue(); + } + case ISD::FSUB: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + + // Try to get the fneg to fold into the source modifier. This undoes generic + // DAG combines and folds them into the mad. + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); + } + } + + return SDValue(); + } + + break; + } + } + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + if (DCI.isBeforeLegalize()) + break; + + MemSDNode *MemNode = cast(N); + SDValue Ptr = MemNode->getBasePtr(); + + // TODO: We could also do this for multiplies. + unsigned AS = MemNode->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector NewOps(MemNode->op_begin(), MemNode->op_end()); + + NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; + return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); + } + } + break; + } + case ISD::AND: + return performAndCombine(N, DCI); + case ISD::OR: + return performOrCombine(N, DCI); + case AMDGPUISD::FP_CLASS: + return performClassCombine(N, DCI); + } + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); +} + +/// \brief Analyze the possible immediate value Op +/// +/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate +/// and the immediate value if it's a literal immediate +int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { + + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + if (const ConstantSDNode *Node = dyn_cast(N)) { + if (TII->isInlineConstant(Node->getAPIntValue())) + return 0; + + uint64_t Val = Node->getZExtValue(); + return isUInt<32>(Val) ? 
Val : -1; + } + + if (const ConstantFPSDNode *Node = dyn_cast(N)) { + if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) + return 0; + + if (Node->getValueType(0) == MVT::f32) + return FloatToBits(Node->getValueAPF().convertToFloat()); + + return -1; + } + + return -1; +} + +/// \brief Helper function for adjustWritemask +static unsigned SubIdx2Lane(unsigned Idx) { + switch (Idx) { + default: return 0; + case AMDGPU::sub0: return 0; + case AMDGPU::sub1: return 1; + case AMDGPU::sub2: return 2; + case AMDGPU::sub3: return 3; + } +} + +/// \brief Adjust the writemask of MIMG instructions +void SITargetLowering::adjustWritemask(MachineSDNode *&Node, + SelectionDAG &DAG) const { + SDNode *Users[4] = { }; + unsigned Lane = 0; + unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned NewDmask = 0; + + // Try to figure out the used register components + for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); + I != E; ++I) { + + // Abort if we can't understand the usage + if (!I->isMachineOpcode() || + I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + return; + + // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. + // Note that subregs are packed, i.e. Lane==0 is the first bit set + // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit + // set, etc. + Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + + // Set which texture component corresponds to the lane. + unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { + assert(Dmask); + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } + + // Abort if we have more than one user per component + if (Users[Lane]) + return; + + Users[Lane] = *I; + NewDmask |= 1 << Comp; + } + + // Abort if there's no change + if (NewDmask == OldDmask) + return; + + // Adjust the writemask in the node + std::vector Ops; + Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); + Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); + + // If we only got one lane, replace it with a copy + // (if NewDmask has only one bit set...) + if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), + MVT::i32); + SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + SDLoc(), Users[Lane]->getValueType(0), + SDValue(Node, 0), RC); + DAG.ReplaceAllUsesWith(Users[Lane], Copy); + return; + } + + // Update the users of the node with the new indices + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + + SDNode *User = Users[i]; + if (!User) + continue; + + SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); + DAG.UpdateNodeOperands(User, User->getOperand(0), Op); + + switch (Idx) { + default: break; + case AMDGPU::sub0: Idx = AMDGPU::sub1; break; + case AMDGPU::sub1: Idx = AMDGPU::sub2; break; + case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + } + } +} + +/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// with frame index operands. +/// LLVM assumes that inputs are to these instructions are registers. 
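+/// Any frame index operand is therefore materialized into an SGPR with
+/// S_MOV_B32 before the operands are rewritten.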
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + + SmallVector Ops; + for (unsigned i = 0; i < Node->getNumOperands(); ++i) { + if (!isa(Node->getOperand(i))) { + Ops.push_back(Node->getOperand(i)); + continue; + } + + SDLoc DL(Node); + Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, + Node->getOperand(i).getValueType(), + Node->getOperand(i)), 0)); + } + + DAG.UpdateNodeOperands(Node, Ops); +} + +/// \brief Fold the instructions after selecting them. +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + if (TII->isMIMG(Node->getMachineOpcode())) + adjustWritemask(Node, DAG); + + if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || + Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + legalizeTargetIndependentNode(Node, DAG); + return Node; + } + return Node; +} + +/// \brief Assign the register class depending on the number of +/// bits set in the writemask +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + TII->legalizeOperands(MI); + + if (TII->isMIMG(MI->getOpcode())) { + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VGPR_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); + MRI.setRegClass(VReg, RC); + return; + } + + // Replace unused atomics with the no return version. + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + if (NoRetAtomicOp != -1) { + if (!Node->hasAnyUseOfValue(0)) { + MI->setDesc(TII->get(NoRetAtomicOp)); + MI->RemoveOperand(0); + } + + return; + } +} + +static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} + +MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); +#if 1 + // XXX - Workaround for moveToVALU not handling different register class + // inserts for REG_SEQUENCE. + + // Build the half of the subregister with the constants. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); + + // Combine the constants and the pointer. 
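+  // The resulting 128-bit descriptor holds the 64-bit pointer in dwords 0-1,
+  // zero in dword 2 and the high half of the default rsrc data format in
+  // dword 3.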
+ const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); +#else + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + +#endif +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multipled by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to the +/// resource ponter. +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const { + SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, DL, MVT::i32)), + 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); +} + +MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); +} + +SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), + cast(VReg)->getReg(), VT); +} + +//===----------------------------------------------------------------------===// +// SI Inline Assembly Support +//===----------------------------------------------------------------------===// + +std::pair +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, + MVT VT) const { + if (Constraint == "r") { + switch(VT.SimpleTy) { + default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); + case MVT::i64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case MVT::i32: + return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + } + } + + if (Constraint.size() > 1) { + const TargetRegisterClass *RC = nullptr; + if (Constraint[1] == 'v') { + RC = &AMDGPU::VGPR_32RegClass; + } else if (Constraint[1] == 's') { + RC = &AMDGPU::SGPR_32RegClass; + } + + if (RC) { + unsigned Idx = 
std::atoi(Constraint.substr(2).c_str()); + if (Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h new file mode 100644 index 00000000000..a956b013bdb --- /dev/null +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -0,0 +1,125 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H + +#include "AMDGPUISelLowering.h" +#include "SIInstrInfo.h" + +namespace llvm { + +class SITargetLowering : public AMDGPUTargetLowering { + SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, + SDValue Chain, unsigned Offset, bool Signed) const; + SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, + SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + + SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, + unsigned AS, + DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +public: + SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); + + bool isShuffleMaskLegal(const SmallVectorImpl &/*Mask*/, + EVT /*VT*/) const override; + + bool isLegalAddressingMode(const AddrMode &AM, + Type *Ty, unsigned AS) const override; + + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; + + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + + bool 
shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + bool enableAggressiveFMAFusion(EVT VT) const override; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + MVT getScalarShiftAmountTy(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; + + int32_t analyzeImmediate(const SDNode *N) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const override; + void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + + MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; + MachineSDNode *buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const; + MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const; + + std::pair getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const override; + SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp new file mode 100644 index 00000000000..90a37f17468 --- /dev/null +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -0,0 +1,480 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously. 
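+///
+/// The pass models the three hardware counters (vm_cnt, exp_cnt and lgkm_cnt,
+/// see the Counters union below) for every register and inserts a wait before
+/// any use or redefinition that an outstanding operation still covers.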
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +/// \brief One variable for each of the hardware counters +typedef union { + struct { + unsigned VM; + unsigned EXP; + unsigned LGKM; + } Named; + unsigned Array[3]; + +} Counters; + +typedef enum { + OTHER, + SMEM, + VMEM +} InstType; + +typedef Counters RegCounters[512]; +typedef std::pair RegInterval; + +class SIInsertWaits : public MachineFunctionPass { + +private: + static char ID; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + + /// \brief Constant hardware limits + static const Counters WaitCounts; + + /// \brief Constant zero value + static const Counters ZeroCounts; + + /// \brief Counter values we have already waited on. + Counters WaitedOn; + + /// \brief Counter values for last instruction issued. + Counters LastIssued; + + /// \brief Registers used by async instructions. + RegCounters UsedRegs; + + /// \brief Registers defined by async instructions. + RegCounters DefinedRegs; + + /// \brief Different export instruction types seen since last wait. + unsigned ExpInstrTypesSeen; + + /// \brief Type of the last opcode. + InstType LastOpcodeType; + + bool LastInstWritesM0; + + /// \brief Get increment/decrement amount for this instruction. + Counters getHwCounts(MachineInstr &MI); + + /// \brief Is operand relevant for async execution? + bool isOpRelevant(MachineOperand &Op); + + /// \brief Get register interval an operand affects. + RegInterval getRegInterval(MachineOperand &Op); + + /// \brief Handle instructions async components + void pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + + /// \brief Insert the actual wait instruction + bool insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Counts); + + /// \brief Do we need def2def checks? + bool unorderedDefines(MachineInstr &MI); + + /// \brief Resolve all operand dependencies to counter requirements + Counters handleOperands(MachineInstr &MI); + + /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 
+ void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + +public: + SIInsertWaits(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(nullptr), + TRI(nullptr), + ExpInstrTypesSeen(0) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI insert wait instructions"; + } + +}; + +} // End anonymous namespace + +char SIInsertWaits::ID = 0; + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; + +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { + return new SIInsertWaits(tm); +} + +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { + + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + Counters Result; + + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); + + // Only consider stores or EXP for EXP_CNT + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && + (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + + // LGKM may uses larger values + if (TSFlags & SIInstrFlags::LGKM_CNT) { + + if (TII->isSMRD(MI.getOpcode())) { + + MachineOperand &Op = MI.getOperand(0); + assert(Op.isReg() && "First LGKM operand must be a register!"); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + + } else { + // DS + Result.Named.LGKM = 1; + } + + } else { + Result.Named.LGKM = 0; + } + + return Result; +} + +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { + + // Constants are always irrelevant + if (!Op.isReg()) + return false; + + // Defines are always relevant + if (Op.isDef()) + return true; + + // For exports all registers are relevant + MachineInstr &MI = *Op.getParent(); + if (MI.getOpcode() == AMDGPU::EXP) + return true; + + // For stores the stored value is also relevant + if (!MI.getDesc().mayStore()) + return false; + + // Check if this operand is the value being stored. + // Special case for DS instructions, since the address + // operand comes before the value operand and it may have + // multiple data operands. + + if (TII->isDS(MI.getOpcode())) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); + if (Data && Op.isIdenticalTo(*Data)) + return true; + + MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + if (Data0 && Op.isIdenticalTo(*Data0)) + return true; + + MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data1 && Op.isIdenticalTo(*Data1)) + return true; + + return false; + } + + // NOTE: This assumes that the value operand is before the + // address operand, and that there is only one value operand. 
+ for (MachineInstr::mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); I != E; ++I) { + + if (I->isReg() && I->isUse()) + return Op.isIdenticalTo(*I); + } + + return false; +} + +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { + + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + return std::make_pair(0, 0); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + + assert(Size >= 4); + + RegInterval Result; + Result.first = TRI->getEncodingValue(Reg); + Result.second = Result.first + Size / 4; + + return Result; +} + +void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + // Get the hardware counter increments and sum them up + Counters Increment = getHwCounts(*I); + unsigned Sum = 0; + + for (unsigned i = 0; i < 3; ++i) { + LastIssued.Array[i] += Increment.Array[i]; + Sum += Increment.Array[i]; + } + + // If we don't increase anything then that's it + if (Sum == 0) { + LastOpcodeType = OTHER; + return; + } + + if (MBB.getParent()->getSubtarget().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + (LastOpcodeType == VMEM && Increment.Named.VM)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + LastInstWritesM0 = false; + } + + if (TII->isSMRD(I->getOpcode())) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) + LastOpcodeType = VMEM; + } + + // Remember which export instructions we have seen + if (Increment.Named.EXP) { + ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; + } + + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + + MachineOperand &Op = I->getOperand(i); + if (!isOpRelevant(Op)) + continue; + + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + // Remember which registers we define + if (Op.isDef()) + DefinedRegs[j] = LastIssued; + + // and which one we are using + if (Op.isUse()) + UsedRegs[j] = LastIssued; + } + } +} + +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Required) { + + // End of program? No need to wait on anything + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + return false; + + // Figure out if the async instructions execute in order + bool Ordered[3]; + + // VM_CNT is always ordered + Ordered[0] = true; + + // EXP_CNT is unordered if we have both EXP & VM-writes + Ordered[1] = ExpInstrTypesSeen == 3; + + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS + Ordered[2] = false; + + // The values we are going to put into the S_WAITCNT instruction + Counters Counts = WaitCounts; + + // Do we really need to wait? + bool NeedWait = false; + + for (unsigned i = 0; i < 3; ++i) { + + if (Required.Array[i] <= WaitedOn.Array[i]) + continue; + + NeedWait = true; + + if (Ordered[i]) { + unsigned Value = LastIssued.Array[i] - Required.Array[i]; + + // Adjust the value to the real hardware possibilities. 
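// For an ordered counter the target is "how many younger operations may still
// be outstanding once the required one has completed". A standalone sketch of
// the computation on the next line (hypothetical helper, same math):
//
//   #include <algorithm>
//   unsigned waitTarget(unsigned LastIssued, unsigned Required, unsigned HwMax) {
//     return std::min(LastIssued - Required, HwMax);
//   }
//   // e.g. 5 loads issued and the 3rd one's result is needed:
//   // waitTarget(5, 3, 15) == 2, i.e. "s_waitcnt vmcnt(2)".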
+ Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + + } else + Counts.Array[i] = 0; + + // Remember on what we have waited on. + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + } + + if (!NeedWait) + return false; + + // Reset EXP_CNT instruction types + if (Counts.Named.EXP == 0) + ExpInstrTypesSeen = 0; + + // Build the wait instruction + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm((Counts.Named.VM & 0xF) | + ((Counts.Named.EXP & 0x7) << 4) | + ((Counts.Named.LGKM & 0x7) << 8)); + + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + return true; +} + +/// \brief helper function for handleOperands +static void increaseCounters(Counters &Dst, const Counters &Src) { + + for (unsigned i = 0; i < 3; ++i) + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); +} + +Counters SIInsertWaits::handleOperands(MachineInstr &MI) { + + Counters Result = ZeroCounts; + + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (MI.getOpcode() == AMDGPU::S_SENDMSG) + return LastIssued; + + // For each register affected by this + // instruction increase the result sequence + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + if (Op.isDef()) { + increaseCounters(Result, UsedRegs[j]); + increaseCounters(Result, DefinedRegs[j]); + } + + if (Op.isUse()) + increaseCounters(Result, DefinedRegs[j]); + } + } + + return Result; +} + +void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + if (MBB.getParent()->getSubtarget().getGeneration() < + AMDGPUSubtarget::VOLCANIC_ISLANDS) + return; + + // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. + if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + LastInstWritesM0 = false; + return; + } + + // Set whether this instruction sets M0 + LastInstWritesM0 = false; + + unsigned NumOperands = I->getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + const MachineOperand &Op = I->getOperand(i); + + if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) + LastInstWritesM0 = true; + } +} + +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. +bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { + bool Changes = false; + + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + + MRI = &MF.getRegInfo(); + + WaitedOn = ZeroCounts; + LastIssued = ZeroCounts; + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + + memset(&UsedRegs, 0, sizeof(UsedRegs)); + memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + + // Wait for everything before a barrier. 
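// For reference, the S_WAITCNT immediate built by insertWait() above packs
// the three counters into one word (vmcnt in bits 3:0, expcnt in bits 6:4,
// lgkmcnt in bits 10:8). A standalone sketch of that packing (hypothetical
// helper, mirroring the shifts used above):
//
//   unsigned packWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
//     return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
//   }
//   // packWaitcnt(0, 7, 7)  == 0x770 -> wait only for vector memory
//   // packWaitcnt(15, 7, 7) == 0x77F -> the "no wait" maxima in WaitCounts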
+ if (I->getOpcode() == AMDGPU::S_BARRIER) + Changes |= insertWait(MBB, I, LastIssued); + else + Changes |= insertWait(MBB, I, handleOperands(*I)); + + pushInstruction(MBB, I); + handleSendMsg(MBB, I); + } + + // Wait for everything at the end of the MBB + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + } + + return Changes; +} diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td new file mode 100644 index 00000000000..211666a9bdb --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -0,0 +1,673 @@ +//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class InstSI pattern> : + AMDGPUInst, PredicateControl { + + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; + + field bits<1> SALU = 0; + field bits<1> VALU = 0; + + field bits<1> SOP1 = 0; + field bits<1> SOP2 = 0; + field bits<1> SOPC = 0; + field bits<1> SOPK = 0; + field bits<1> SOPP = 0; + + field bits<1> VOP1 = 0; + field bits<1> VOP2 = 0; + field bits<1> VOP3 = 0; + field bits<1> VOPC = 0; + + field bits<1> MUBUF = 0; + field bits<1> MTBUF = 0; + field bits<1> SMRD = 0; + field bits<1> DS = 0; + field bits<1> MIMG = 0; + field bits<1> FLAT = 0; + field bits<1> WQM = 0; + field bits<1> VGPRSpill = 0; + + // These need to be kept in sync with the enum in SIInstrFlags. + let TSFlags{0} = VM_CNT; + let TSFlags{1} = EXP_CNT; + let TSFlags{2} = LGKM_CNT; + + let TSFlags{3} = SALU; + let TSFlags{4} = VALU; + + let TSFlags{5} = SOP1; + let TSFlags{6} = SOP2; + let TSFlags{7} = SOPC; + let TSFlags{8} = SOPK; + let TSFlags{9} = SOPP; + + let TSFlags{10} = VOP1; + let TSFlags{11} = VOP2; + let TSFlags{12} = VOP3; + let TSFlags{13} = VOPC; + + let TSFlags{14} = MUBUF; + let TSFlags{15} = MTBUF; + let TSFlags{16} = SMRD; + let TSFlags{17} = DS; + let TSFlags{18} = MIMG; + let TSFlags{19} = FLAT; + let TSFlags{20} = WQM; + let TSFlags{21} = VGPRSpill; + + // Most instructions require adjustments after selection to satisfy + // operand requirements. + let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; +} + +class Enc32 { + field bits<32> Inst; + int Size = 4; +} + +class Enc64 { + field bits<64> Inst; + int Size = 8; +} + +class VOPDstOperand : RegisterOperand ; +def VOPDstVCC : VOPDstOperand ; + +let Uses = [EXEC] in { + +class VOPAnyCommon pattern> : + InstSI { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VALU = 1; +} + +class VOPCCommon pattern> : + VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + + let DisableEncoding = "$dst"; + let VOPC = 1; + let Size = 4; +} + +class VOP1Common pattern> : + VOPAnyCommon { + + let VOP1 = 1; + let Size = 4; +} + +class VOP2Common pattern> : + VOPAnyCommon { + + let VOP2 = 1; + let Size = 4; +} + +class VOP3Common pattern> : + VOPAnyCommon { + + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always prefered, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). 
+ let AddedComplexity = -1000; + + let VOP3 = 1; + let VALU = 1; + + let AsmMatchConverter = "cvtVOP3"; + let isCodeGenOnly = 0; + + int Size = 8; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Scalar operations +//===----------------------------------------------------------------------===// + +class SOP1e op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = op; + let Inst{22-16} = sdst; + let Inst{31-23} = 0x17d; //encoding; +} + +class SOP2e op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = sdst; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding +} + +class SOPCe op> : Enc32 { + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; +} + +class SOPKe op> : Enc32 { + bits <7> sdst; + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding +} + +class SOPK64e op> : Enc64 { + bits <7> sdst = 0; + bits <16> simm16; + bits <32> imm; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; + + let Inst{63-32} = imm; +} + +class SOPPe op> : Enc32 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding +} + +class SMRDe op, bits<1> imm> : Enc32 { + bits<7> sdst; + bits<7> sbase; + bits<8> offset; + + let Inst{7-0} = offset; + let Inst{8} = imm; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding +} + +let SchedRW = [WriteSALU] in { +class SOP1 pattern> : + InstSI { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP1 = 1; +} + +class SOP2 pattern> : + InstSI { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP2 = 1; + + let UseNamedOperandTable = 1; +} + +class SOPC op, dag outs, dag ins, string asm, list pattern> : + InstSI, SOPCe { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPC = 1; + let isCodeGenOnly = 0; + + let UseNamedOperandTable = 1; +} + +class SOPK pattern> : + InstSI { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPK = 1; + + let UseNamedOperandTable = 1; +} + +class SOPP op, dag ins, string asm, list pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPPe { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + + let UseNamedOperandTable = 1; +} + +} // let SchedRW = [WriteSALU] + +class SMRD pattern> : + InstSI { + + let LGKM_CNT = 1; + let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; +} + +//===----------------------------------------------------------------------===// +// Vector ALU operations +//===----------------------------------------------------------------------===// + +class VOP1e op> : Enc32 { + bits<8> vdst; + bits<9> src0; + + let Inst{8-0} = src0; + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +class VOP2e op> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + + let Inst{8-0} = src0; + let 
Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP2_MADKe op> : Enc64 { + + bits<8> vdst; + bits<9> src0; + bits<8> vsrc1; + bits<32> src2; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = src2; +} + +class VOP3e op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{11} = clamp; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOPCe op> : Enc32 { + bits<9> src0; + bits<8> vsrc1; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VINTRPe op> : Enc32 { + bits<8> vdst; + bits<8> vsrc; + bits<2> attrchan; + bits<6> attr; + + let Inst{7-0} = vsrc; + let Inst{9-8} = attrchan; + let Inst{15-10} = attr; + let Inst{17-16} = op; + let Inst{25-18} = vdst; + let Inst{31-26} = 0x32; // encoding +} + +class DSe op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{17} = gds; + let Inst{25-18} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{16} = lds; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe op> : Enc64 { + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{18-16} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let 
Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MIMGe op> : Enc64 { + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<1> glc; + bits<1> da; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<1> slc; + bits<8> vaddr; + bits<7> srsrc; + bits<7> ssamp; + + let Inst{11-8} = dmask; + let Inst{12} = unorm; + let Inst{13} = glc; + let Inst{14} = da; + let Inst{15} = r128; + let Inst{16} = tfe; + let Inst{17} = lwe; + let Inst{24-18} = op; + let Inst{25} = slc; + let Inst{31-26} = 0x3c; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{57-53} = ssamp{6-2}; +} + +class FLATe op> : Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. + let Inst{55} = tfe; + let Inst{63-56} = vdst; +} + +class EXPe : Enc64 { + bits<4> en; + bits<6> tgt; + bits<1> compr; + bits<1> done; + bits<1> vm; + bits<8> vsrc0; + bits<8> vsrc1; + bits<8> vsrc2; + bits<8> vsrc3; + + let Inst{3-0} = en; + let Inst{9-4} = tgt; + let Inst{10} = compr; + let Inst{11} = done; + let Inst{12} = vm; + let Inst{31-26} = 0x3e; + let Inst{39-32} = vsrc0; + let Inst{47-40} = vsrc1; + let Inst{55-48} = vsrc2; + let Inst{63-56} = vsrc3; +} + +let Uses = [EXEC] in { + +class VOP1 op, dag outs, dag ins, string asm, list pattern> : + VOP1Common , + VOP1e { + let isCodeGenOnly = 0; +} + +class VOP2 op, dag outs, dag ins, string asm, list pattern> : + VOP2Common , VOP2e { + let isCodeGenOnly = 0; +} + +class VOPC op, dag ins, string asm, list pattern> : + VOPCCommon , VOPCe ; + +class VINTRPCommon pattern> : + InstSI { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { + +class DS pattern> : + InstSI { + + let LGKM_CNT = 1; + let DS = 1; + let UseNamedOperandTable = 1; + let Uses = [M0]; + + // Most instruction load and store data, so set this as the default. + let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; + let AsmMatchConverter = "cvtDS"; + let SchedRW = [WriteLDS]; +} + +class MUBUF pattern> : + InstSI { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let AsmMatchConverter = "cvtMubuf"; + let SchedRW = [WriteVMEM]; +} + +class MTBUF pattern> : + InstSI { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; +} + +class FLAT op, dag outs, dag ins, string asm, list pattern> : + InstSI, FLATe { + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. 
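  // In practice that means a later use of a flat load's destination register
  // has to wait for both vmcnt and lgkmcnt to drain; SIInsertWaits derives
  // exactly that from these two flags via getHwCounts(), assuming the pass
  // and these TSFlags bits stay in sync as noted in InstSI above.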
+ let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; + let AsmMatchConverter = "cvtFlat"; + let SchedRW = [WriteVMEM]; +} + +class MIMG op, dag outs, dag ins, string asm, list pattern> : + InstSI , MIMGe { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + + let hasSideEffects = 0; // XXX ???? +} + + +} // End Uses = [EXEC] diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp new file mode 100644 index 00000000000..d647c25286f --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -0,0 +1,2723 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + + +#include "SIInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +//===----------------------------------------------------------------------===// +// TargetInstrInfo callbacks +//===----------------------------------------------------------------------===// + +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +/// \brief Returns true if both nodes have the same value for the given +/// operand \p Op, or if both nodes do not have this operand. +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { + unsigned Opc0 = N0->getMachineOpcode(); + unsigned Opc1 = N1->getMachineOpcode(); + + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); + + if (Op0Idx == -1 && Op1Idx == -1) + return true; + + + if ((Op0Idx == -1 && Op1Idx != -1) || + (Op1Idx == -1 && Op0Idx != -1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index. + --Op0Idx; + --Op1Idx; + + return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); +} + +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + // TODO: The generic check fails for VALU instructions that should be + // rematerializable due to implicit reads of exec. We really want all of the + // generic logic for this except for this. 
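// Back-reference for nodesHaveSameOperandValue() above: getNamedOperandIdx()
// indexes MachineInstr operands, where operand 0 is the result, while a
// MachineSDNode's operand list carries no result, hence the "--Op0Idx".
// Minimal sketch with a hypothetical operand sitting at MachineInstr index 2:
//
//   int MIIdx = 2;          // index among MachineInstr operands (result first)
//   int SDIdx = MIIdx - 1;  // the same operand among the SDNode's operands
//   SDValue V = Node->getOperand(SDIdx);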
+ switch (MI->getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + + // FIXME: Handle this case: + if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) + return false; + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // Skip read2 / write2 variants for simplicity. + // TODO: We should report true if the used offsets are adjacent (excluded + // st64 versions). + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + return false; + + Offset0 = cast(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + const ConstantSDNode *Load0Offset = + dyn_cast(Load0->getOperand(1)); + const ConstantSDNode *Load1Offset = + dyn_cast(Load1->getOperand(1)); + + if (!Load0Offset || !Load1Offset) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = Load0Offset->getZExtValue(); + Offset1 = Load1Offset->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + + // MUBUF and MTBUF have vaddr at different indices. + if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || + findChainOperand(Load0) != findChainOperand(Load1) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + + if (OffIdx0 == -1 || OffIdx1 == -1) + return false; + + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // inlcude the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + --OffIdx0; + --OffIdx1; + + SDValue Off0 = Load0->getOperand(OffIdx0); + SDValue Off1 = Load1->getOperand(OffIdx1); + + // The offset might be a FrameIndexSDNode. 
+ if (!isa(Off0) || !isa(Off1)) + return false; + + Offset0 = cast(Off0)->getZExtValue(); + Offset1 = cast(Off1)->getZExtValue(); + return true; + } + + return false; +} + +static bool isStride64(unsigned Opc) { + switch (Opc) { + case AMDGPU::DS_READ2ST64_B32: + case AMDGPU::DS_READ2ST64_B64: + case AMDGPU::DS_WRITE2ST64_B32: + case AMDGPU::DS_WRITE2ST64_B64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const { + unsigned Opc = LdSt->getOpcode(); + if (isDS(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (OffsetImm) { + // Normal, single offset LDS instruction. + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. We + // will use this for some partially aligned loads. + const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset1); + + uint8_t Offset0 = Offset0Imm->getImm(); + uint8_t Offset1 = Offset1Imm->getImm(); + assert(Offset1 > Offset0); + + if (Offset1 - Offset0 == 1) { + // Each of these offsets is in element sized units, so we need to convert + // to bytes of the individual reads. + + unsigned EltSize; + if (LdSt->mayLoad()) + EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + else { + assert(LdSt->mayStore()); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + } + + if (isStride64(Opc)) + EltSize *= 64; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = EltSize * Offset0; + return true; + } + + return false; + } + + if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + return false; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::vaddr); + if (!AddrReg) + return false; + + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + if (isSMRD(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (!OffsetImm) + return false; + + const MachineOperand *SBaseReg = getNamedOperand(*LdSt, + AMDGPU::OpName::sbase); + BaseReg = SBaseReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + return false; +} + +bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + unsigned Opc0 = FirstLdSt->getOpcode(); + unsigned Opc1 = SecondLdSt->getOpcode(); + + // TODO: This needs finer tuning + if (NumLoads > 4) + return false; + + if (isDS(Opc0) && isDS(Opc1)) + return true; + + if (isSMRD(Opc0) && isSMRD(Opc1)) + return true; + + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + return true; + + return false; +} + +void +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + + // If we are trying to copy to or from SCC, there is a bug somewhere else 
in + // the backend. While it may be theoretically possible to do this, it should + // never be necessary. + assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + }; + + static const int16_t Sub0_7[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + }; + + static const int16_t Sub0_3[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + }; + + static const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + }; + + static const int16_t Sub0_1[] = { + AMDGPU::sub0, AMDGPU::sub1, 0 + }; + + unsigned Opcode; + const int16_t *SubIndices; + + if (AMDGPU::SReg_32RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (DestReg == AMDGPU::VCC) { + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_3; + + } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_7; + + } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_15; + + } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || + AMDGPU::SReg_64RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_1; + + } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_2; + + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || + AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_3; + + } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || + AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_7; + + } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || + 
AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_15; + + } else { + llvm_unreachable("Can't copy register!"); + } + + while (unsigned SubIdx = *SubIndices++) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, SubIdx)); + + Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + + if (*SubIndices) + Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + } +} + +unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + + int NewOpc; + + // Try to map original to commuted opcode + NewOpc = AMDGPU::getCommuteRev(Opcode); + // Check if the commuted (REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) + return NewOpc; + + // Try to map commuted to original opcode + NewOpc = AMDGPU::getCommuteOrig(Opcode); + // Check if the original (non-REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) + return NewOpc; + + return Opcode; +} + +unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + + if (DstRC->getSize() == 4) { + return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + return AMDGPU::S_MOV_B64; + } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + return AMDGPU::V_MOV_B64_PSEUDO; + } + return AMDGPU::COPY; +} + +void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; + + if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for spilling + // SGPRs. + switch (RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + } + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + MFI->setHasSpilledVGPRs(); + + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. 
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) + .addReg(SrcReg); + } +} + +void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; + + if (RI.isSGPRClass(RC)){ + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + } + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + + } else { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + } +} + +/// \param @Offset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + const AMDGPUSubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = + static_cast(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); + unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); + unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { + if 
(!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) + .addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); + } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); + + return TmpReg; +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + case AMDGPU::SI_CONSTDATA_PTR: { + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); + + // Add 32-bit offset from this instruction to the start of the constant data. + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addTargetIndex(AMDGPU::TI_CONSTDATA_START) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) + .addReg(AMDGPU::SCC, RegState::Implicit); + MI->eraseFromParent(); + break; + } + case AMDGPU::SGPR_USE: + // This is just a placeholder for register allocation. + MI->eraseFromParent(); + break; + + case AMDGPU::V_MOV_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + const MachineOperand &SrcOp = MI->getOperand(1); + // FIXME: Will this work for 64-bit floating point immediates? 
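// The expansion below splits a 64-bit immediate into two 32-bit V_MOV_B32s.
// Equivalent plain-integer sketch of the lo/hi split done with APInt below
// (example value only):
//
//   #include <cstdint>
//   uint64_t Imm = 0x4045000000000000ull;             // 42.0 as f64 bits
//   uint32_t Lo  = static_cast<uint32_t>(Imm);        // -> V_MOV_B32 into sub0
//   uint32_t Hi  = static_cast<uint32_t>(Imm >> 32);  // -> V_MOV_B32 into sub1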
+ assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + } else { + assert(SrcOp.isReg()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit); + } + MI->eraseFromParent(); + break; + } + + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + unsigned Src0 = MI->getOperand(1).getReg(); + unsigned Src1 = MI->getOperand(2).getReg(); + const MachineOperand &SrcCond = MI->getOperand(3); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addOperand(SrcCond); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addOperand(SrcCond); + MI->eraseFromParent(); + break; + } + } + return true; +} + +MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI) const { + + if (MI->getNumOperands() < 3) + return nullptr; + + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + assert(Src0Idx != -1 && "Should always have src0 operand"); + + MachineOperand &Src0 = MI->getOperand(Src0Idx); + if (!Src0.isReg()) + return nullptr; + + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + if (Src1Idx == -1) + return nullptr; + + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // Make sure it's legal to commute operands for VOP2. + if (isVOP2(MI->getOpcode()) && + (!isOperandLegal(MI, Src0Idx, &Src1) || + !isOperandLegal(MI, Src1Idx, &Src0))) { + return nullptr; + } + + if (!Src1.isReg()) { + // Allow commuting instructions with Imm operands. + if (NewMI || !Src1.isImm() || + (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + return nullptr; + } + + // Be sure to copy the source modifiers to the right place. + if (MachineOperand *Src0Mods + = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods + = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + + int Src0ModsVal = Src0Mods->getImm(); + if (!Src1Mods && Src0ModsVal != 0) + return nullptr; + + // XXX - This assert might be a lie. It might be useful to have a neg + // modifier with 0.0. 
+ int Src1ModsVal = Src1Mods->getImm(); + assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + } + + unsigned Reg = Src0.getReg(); + unsigned SubReg = Src0.getSubReg(); + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else + llvm_unreachable("Should only have immediates"); + + Src1.ChangeToRegister(Reg, false); + Src1.setSubReg(SubReg); + } else { + MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + } + + if (MI) + MI->setDesc(get(commuteOpcode(*MI))); + + return MI; +} + +// This needs to be implemented because the source modifiers may be inserted +// between the true commutable operands, and the base +// TargetInstrInfo::commuteInstruction uses it. +bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isCommutable()) + return false; + + unsigned Opc = MI->getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + + // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on + // immediate. + if (!MI->getOperand(Src0Idx).isReg()) + return false; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; + + if (!MI->getOperand(Src1Idx).isReg()) + return false; + + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + + SrcOpIdx1 = Src0Idx; + SrcOpIdx2 = Src1Idx; + return true; +} + +MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, + unsigned SrcReg) const { + return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), + DstReg) .addReg(SrcReg); +} + +bool SIInstrInfo::isMov(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + } +} + +bool +SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + return RC != &AMDGPU::EXECRegRegClass; +} + +static void removeModOperands(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src0_modifiers); + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src1_modifiers); + int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2_modifiers); + + MI.RemoveOperand(Src2ModIdx); + MI.RemoveOperand(Src1ModIdx); + MI.RemoveOperand(Src0ModIdx); +} + +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned Opc = UseMI->getOpcode(); + if (Opc == AMDGPU::V_MAD_F32) { + // Don't fold if we are using source modifiers. The new VOP2 instructions + // don't have them. 
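// Background on the two VOP2 forms this fold targets, assuming the usual SI
// semantics (sketch; written as formulas rather than assembly operand order):
//
//   float madmk(float s0, float s1, float K) { return s0 * K + s1; }  // v_madmk_f32
//   float madak(float s0, float s1, float K) { return s0 * s1 + K; }  // v_madak_f32
//
// i.e. v_madmk folds a constant in a multiplied position and v_madak folds
// the added constant, which is why the code below distinguishes whether the
// constant feeds src0 or src2 of the original v_mad_f32.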
+ if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + return false; + } + + MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + + // Multiplied part is the constant: Use v_madmk_f32 + // We should only expect these to be on src0 due to canonicalizations. + if (Src0->isReg() && Src0->getReg() == Reg) { + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + if (!Src2->isReg() || + (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + return false; + + // We need to do some weird looking operand shuffling since the madmk + // operands are out of the normal expected order with the multiplied + // constant as the last operand. + // + // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 + // src0 -> src2 K + // src1 -> src0 + // src2 -> src1 + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + unsigned Src1Reg = Src1->getReg(); + unsigned Src1SubReg = Src1->getSubReg(); + unsigned Src2Reg = Src2->getReg(); + unsigned Src2SubReg = Src2->getSubReg(); + Src0->setReg(Src1Reg); + Src0->setSubReg(Src1SubReg); + Src0->setIsKill(Src1->isKill()); + + Src1->setReg(Src2Reg); + Src1->setSubReg(Src2SubReg); + Src1->setIsKill(Src2->isKill()); + + Src2->ChangeToImmediate(Imm); + + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + + // Added part is the constant: Use v_madak_f32 + if (Src2->isReg() && Src2->getReg() == Reg) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + if (!Src0->isImm() && + (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + Src2->ChangeToImmediate(Imm); + + // These come before src2. 
+ removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + } + + return false; +} + +bool +SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + return MI->getOperand(1).isImm(); + } +} + +static bool offsetsDoNotOverlap(int WidthA, int OffsetA, + int WidthB, int OffsetB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + return LowOffset + LowWidth <= HighOffset; +} + +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const { + unsigned BaseReg0, Offset0; + unsigned BaseReg1, Offset1; + + if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && + getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && + "read2 / write2 not expected here yet"); + unsigned Width0 = (*MIa->memoperands_begin())->getSize(); + unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + if (BaseReg0 == BaseReg1 && + offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + return true; + } + } + + return false; +} + +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + unsigned Opc0 = MIa->getOpcode(); + unsigned Opc1 = MIb->getOpcode(); + + assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + "MIa must load from or modify a memory location"); + assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + "MIb must load from or modify a memory location"); + + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + return false; + + // XXX - Can we relax this between address spaces? + if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; + + // TODO: Should we check the address space from the MachineMemOperand? That + // would allow us to distinguish objects we know don't alias based on the + // underlying addres space, even if it was lowered to a different one, + // e.g. private accesses lowered to use MUBUF instructions on a scratch + // buffer. 
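// Worked example of the offset-overlap rule above; a standalone sketch that
// only assumes the static offsetsDoNotOverlap helper defined earlier in this
// file. The function name below is illustrative, not part of the real API.
static bool offsetsDoNotOverlapExamples() {
  // A 4-byte access at offset 0 and a 4-byte access at offset 4 off the same
  // base register are disjoint: the lower access ends where the higher starts.
  bool Disjoint = offsetsDoNotOverlap(/*WidthA=*/4, /*OffsetA=*/0,
                                      /*WidthB=*/4, /*OffsetB=*/4);
  // An 8-byte access at offset 0 overlaps a 4-byte access at offset 4.
  bool Overlaps = !offsetsDoNotOverlap(/*WidthA=*/8, /*OffsetA=*/0,
                                       /*WidthB=*/4, /*OffsetB=*/4);
  return Disjoint && Overlaps; // both expectations hold
}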
+ if (isDS(Opc0)) { + if (isDS(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1); + } + + if (isMUBUF(Opc0) || isMTBUF(Opc0)) { + if (isMUBUF(Opc1) || isMTBUF(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isSMRD(Opc1); + } + + if (isSMRD(Opc0)) { + if (isSMRD(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + } + + if (isFLAT(Opc0)) { + if (isFLAT(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return false; + } + + return false; +} + +bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { + int64_t SVal = Imm.getSExtValue(); + if (SVal >= -16 && SVal <= 64) + return true; + + if (Imm.getBitWidth() == 64) { + uint64_t Val = Imm.getZExtValue(); + return (DoubleToBits(0.0) == Val) || + (DoubleToBits(1.0) == Val) || + (DoubleToBits(-1.0) == Val) || + (DoubleToBits(0.5) == Val) || + (DoubleToBits(-0.5) == Val) || + (DoubleToBits(2.0) == Val) || + (DoubleToBits(-2.0) == Val) || + (DoubleToBits(4.0) == Val) || + (DoubleToBits(-4.0) == Val); + } + + // The actual type of the operand does not seem to matter as long + // as the bits match one of the inline immediate values. For example: + // + // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, + // so it is a legal inline immediate. + // + // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in + // floating-point, so it is a legal inline immediate. + uint32_t Val = Imm.getZExtValue(); + + return (FloatToBits(0.0f) == Val) || + (FloatToBits(1.0f) == Val) || + (FloatToBits(-1.0f) == Val) || + (FloatToBits(0.5f) == Val) || + (FloatToBits(-0.5f) == Val) || + (FloatToBits(2.0f) == Val) || + (FloatToBits(-2.0f) == Val) || + (FloatToBits(4.0f) == Val) || + (FloatToBits(-4.0f) == Val); +} + +bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, + unsigned OpSize) const { + if (MO.isImm()) { + // MachineOperand provides no way to tell the true operand size, since it + // only records a 64-bit value. We need to know the size to determine if a + // 32-bit floating point immediate bit pattern is legal for an integer + // immediate. It would be for any 32-bit integer operand, but would not be + // for a 64-bit one. 
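// Standalone sketch of the 32-bit inline-constant rule checked above: small
// signed integers in [-16, 64] encode directly, otherwise the raw bits must
// match one of the supported floating-point constants. isSIInlineConstant32
// and floatBits are illustrative names, not part of the real API.
#include <cstdint>
#include <cstring>

static uint32_t floatBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // reinterpret the float's bit pattern
  return Bits;
}

static bool isSIInlineConstant32(int32_t V) {
  if (V >= -16 && V <= 64)
    return true;
  const float Consts[] = {0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f};
  for (float C : Consts)
    if (floatBits(C) == static_cast<uint32_t>(V))
      return true; // e.g. 0x3f800000 (1.0f) is a legal inline immediate
  return false;
}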
+ + unsigned BitSize = 8 * OpSize; + return isInlineConstant(APInt(BitSize, MO.getImm(), true)); + } + + return false; +} + +bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, + unsigned OpSize) const { + return MO.isImm() && !isInlineConstant(MO, OpSize); +} + +static bool compareMachineOp(const MachineOperand &Op0, + const MachineOperand &Op1) { + if (Op0.getType() != Op1.getType()) + return false; + + switch (Op0.getType()) { + case MachineOperand::MO_Register: + return Op0.getReg() == Op1.getReg(); + case MachineOperand::MO_Immediate: + return Op0.getImm() == Op1.getImm(); + default: + llvm_unreachable("Didn't expect to be comparing these operand types"); + } +} + +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); + if (isLiteralConstant(MO, OpSize)) + return RI.opCanUseLiteralConstant(OpInfo.OperandType); + + return RI.opCanUseInlineConstant(OpInfo.OperandType); +} + +bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + int Op32 = AMDGPU::getVOPe32(Opcode); + if (Op32 == -1) + return false; + + return pseudoToMCOpcode(Op32) != -1; +} + +bool SIInstrInfo::hasModifiers(unsigned Opcode) const { + // The src0_modifier operand is present on all instructions + // that have modifiers. + + return AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::src0_modifiers) != -1; +} + +bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const { + const MachineOperand *Mods = getNamedOperand(MI, OpName); + return Mods && Mods->getImm(); +} + +bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const { + // Literal constants use the constant bus. + if (isLiteralConstant(MO, OpSize)) + return true; + + if (!MO.isReg() || !MO.isUse()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); + + // FLAT_SCR is just an SGPR pair. + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) + return true; + + // EXEC register uses the constant bus. + if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) + return true; + + // SGPRs use the constant bus + if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { + return true; + } + + return false; +} + +bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const { + uint16_t Opcode = MI->getOpcode(); + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + // Make sure the number of operands is correct. 
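// Minimal sketch of the VOP* constant-bus restriction enforced below: among
// the real source operands, at most one may read the constant bus (an SGPR,
// a literal, M0, VCC, EXEC or FLAT_SCR), and reuse of the same SGPR only
// counts once. Operand and kInvalidReg are illustrative stand-ins, not LLVM
// types.
struct Operand {
  bool UsesConstantBus; // SGPR, literal, M0, VCC, EXEC or FLAT_SCR
  bool IsReg;
  unsigned Reg;
};

static bool constantBusIsLegal(const Operand *Ops, unsigned NumOps) {
  const unsigned kInvalidReg = ~0u;
  unsigned Count = 0, SGPRUsed = kInvalidReg;
  for (unsigned I = 0; I != NumOps; ++I) {
    if (!Ops[I].UsesConstantBus)
      continue;
    if (Ops[I].IsReg) {
      if (Ops[I].Reg != SGPRUsed)
        ++Count; // a new SGPR takes another constant-bus slot
      SGPRUsed = Ops[I].Reg;
    } else {
      ++Count; // a literal constant always uses the bus
    }
  }
  return Count <= 1;
}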
+ const MCInstrDesc &Desc = get(Opcode); + if (!Desc.isVariadic() && + Desc.getNumOperands() != MI->getNumExplicitOperands()) { + ErrInfo = "Instruction has wrong number of operands."; + return false; + } + + // Make sure the register classes are correct + for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isFPImm()) { + ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " + "all fp values to integers."; + return false; + } + + int RegClass = Desc.OpInfo[i].RegClass; + + switch (Desc.OpInfo[i].OperandType) { + case MCOI::OPERAND_REGISTER: + if (MI->getOperand(i).isImm()) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case AMDGPU::OPERAND_REG_IMM32: + break; + case AMDGPU::OPERAND_REG_INLINE_C: + if (isLiteralConstant(MI->getOperand(i), + RI.getRegClass(RegClass)->getSize())) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case MCOI::OPERAND_IMMEDIATE: + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { + ErrInfo = "Expected immediate, but got non-immediate"; + return false; + } + // Fall-through + default: + continue; + } + + if (!MI->getOperand(i).isReg()) + continue; + + if (RegClass != -1) { + unsigned Reg = MI->getOperand(i).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + const TargetRegisterClass *RC = RI.getRegClass(RegClass); + if (!RC->contains(Reg)) { + ErrInfo = "Operand has incorrect register class."; + return false; + } + } + } + + + // Verify VOP* + if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + unsigned ConstantBusCount = 0; + unsigned SGPRUsed = AMDGPU::NoRegister; + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + break; + const MachineOperand &MO = MI->getOperand(OpIdx); + if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (MO.isReg()) { + if (MO.getReg() != SGPRUsed) + ++ConstantBusCount; + SGPRUsed = MO.getReg(); + } else { + ++ConstantBusCount; + } + } + } + if (ConstantBusCount > 1) { + ErrInfo = "VOP* instruction uses the constant bus more than once"; + return false; + } + } + + // Verify misc. restrictions on specific instructions. + if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || + Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { + const MachineOperand &Src0 = MI->getOperand(Src0Idx); + const MachineOperand &Src1 = MI->getOperand(Src1Idx); + const MachineOperand &Src2 = MI->getOperand(Src2Idx); + if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { + if (!compareMachineOp(Src0, Src1) && + !compareMachineOp(Src0, Src2)) { + ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; + return false; + } + } + } + + return true; +} + +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: return AMDGPU::INSTRUCTION_LIST_END; + case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; + case AMDGPU::COPY: return AMDGPU::COPY; + case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; + case AMDGPU::S_MOV_B32: + return MI.getOperand(1).isReg() ? 
+ AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; + case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; + case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; + case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; + case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; + case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; + case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; + case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; + case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; + case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; + case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; + case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; + case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; + case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + } +} + +bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { + return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; +} + +const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCInstrDesc &Desc = get(MI.getOpcode()); + if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || + Desc.OpInfo[OpNo].RegClass == -1) { + unsigned Reg = MI.getOperand(OpNo).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + return RI.getPhysRegClass(Reg); + } + + unsigned RCID = Desc.OpInfo[OpNo].RegClass; + return RI.getRegClass(RCID); +} + +bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: + return RI.hasVGPRs(getOpRegClass(MI, 0)); + default: + return RI.hasVGPRs(getOpRegClass(MI, OpNo)); + } +} + +void 
SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &MO = MI->getOperand(OpIdx); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; + const TargetRegisterClass *RC = RI.getRegClass(RCID); + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (MO.isReg()) + Opcode = AMDGPU::COPY; + else if (RI.isSGPRClass(RC)) + Opcode = AMDGPU::S_MOV_B32; + + + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); + if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) + VRC = &AMDGPU::VReg_64RegClass; + else + VRC = &AMDGPU::VGPR_32RegClass; + + unsigned Reg = MRI.createVirtualRegister(VRC); + DebugLoc DL = MBB->findDebugLoc(I); + BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) + .addOperand(MO); + MO.ChangeToRegister(Reg, false); +} + +unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) + const { + assert(SuperReg.isReg()); + + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + unsigned SubReg = MRI.createVirtualRegister(SubRC); + + // Just in case the super register is itself a sub-register, copy it to a new + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to + // eliminate this extra copy. + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) + .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(NewSuperReg, 0, SubIdx); + + return SubReg; +} + +MachineOperand SIInstrInfo::buildExtractSubRegOrImm( + MachineBasicBlock::iterator MII, + MachineRegisterInfo &MRI, + MachineOperand &Op, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const { + if (Op.isImm()) { + // XXX - Is there a better way to do this? 
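// Sketch of the 64-bit immediate splitting used here and in split64BitImm
// below: sub0 receives the low 32 bits and sub1 the high 32 bits. splitImm64
// is an illustrative helper name only.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm & 0xFFFFFFFFu); // -> sub0
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);         // -> sub1
  return {Lo, Hi};
}
// e.g. splitImm64(0x123456789ULL) yields {0x23456789, 0x1}.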
+ if (SubIdx == AMDGPU::sub0) + return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); + if (SubIdx == AMDGPU::sub1) + return MachineOperand::CreateImm(Op.getImm() >> 32); + + llvm_unreachable("Unhandled register index for immediate"); + } + + unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, + SubIdx, SubRC); + return MachineOperand::CreateReg(SubReg, false); +} + +unsigned SIInstrInfo::split64BitImm(SmallVectorImpl &Worklist, + MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineOperand &Op) const { + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned Dst = MRI.createVirtualRegister(RC); + + MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), + LoDst) + .addImm(Op.getImm() & 0xFFFFFFFF); + MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), + HiDst) + .addImm(Op.getImm() >> 32); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) + .addReg(LoDst) + .addImm(AMDGPU::sub0) + .addReg(HiDst) + .addImm(AMDGPU::sub1); + + Worklist.push_back(Lo); + Worklist.push_back(Hi); + + return Dst; +} + +// Change the order of operands from (0, 1, 2) to (0, 2, 1) +void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { + assert(Inst->getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst->getOperand(1); + Inst->RemoveOperand(1); + Inst->addOperand(Op1); +} + +bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO) const { + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const TargetRegisterClass *DefinedRC = + OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + if (!MO) + MO = &MI->getOperand(OpIdx); + + if (isVALU(InstDesc.Opcode) && + usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + unsigned SGPRUsed = + MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI->getOperand(i); + if (Op.isReg() && Op.getReg() != SGPRUsed && + usesConstantBus(MRI, Op, getOpSize(*MI, i))) { + return false; + } + } + } + + if (MO->isReg()) { + assert(DefinedRC); + const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + } + + + // Handle non-register types that are treated like immediates. + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + + if (!DefinedRC) { + // This operand expects an immediate. 
+ return true; + } + + return isImmOperandLegal(MI, OpIdx, *MO); +} + +void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src2); + + // Legalize VOP2 + if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { + // Legalize src0 + if (!isOperandLegal(MI, Src0Idx)) + legalizeOpWithMove(MI, Src0Idx); + + // Legalize src1 + if (isOperandLegal(MI, Src1Idx)) + return; + + // Usually src0 of VOP2 instructions allow more types of inputs + // than src1, so try to commute the instruction to decrease our + // chances of having to insert a MOV instruction to legalize src1. + if (MI->isCommutable()) { + if (commuteInstruction(MI)) + // If we are successful in commuting, then we know MI is legal, so + // we are done. + return; + } + + legalizeOpWithMove(MI, Src1Idx); + return; + } + + // XXX - Do any VOP3 instructions read VCC? + // Legalize VOP3 + if (isVOP3(MI->getOpcode())) { + int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + if (MO.isReg()) { + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; + } + } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { + // If it is not a register and not a literal constant, then it must be + // an inline constant which is always legal. + continue; + } + // If we make it this far, then the operand is not legal and we must + // legalize it. + legalizeOpWithMove(MI, Idx); + } + } + + // Legalize REG_SEQUENCE and PHI + // The register class of the operands much be the same type as the register + // class of the output. + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || + MI->getOpcode() == AMDGPU::PHI) { + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; + for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { + if (!MI->getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + continue; + const TargetRegisterClass *OpRC = + MRI.getRegClass(MI->getOperand(i).getReg()); + if (RI.hasVGPRs(OpRC)) { + VRC = OpRC; + } else { + SRC = OpRC; + } + } + + // If any of the operands are VGPR registers, then they all most be + // otherwise we will create illegal VGPR->SGPR copies when legalizing + // them. + if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { + if (!VRC) { + assert(SRC); + VRC = RI.getEquivalentVGPRClass(SRC); + } + RC = VRC; + } else { + RC = SRC; + } + + // Update all the operands so they have the same type. 
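// Sketch of the class-unification rule above: if any input of a REG_SEQUENCE
// or PHI lives in a VGPR class, or the result is not an SGPR class, every
// input must be moved to the equivalent VGPR class, because VGPR->SGPR copies
// are illegal. RegBank and pickResultBank are illustrative, not LLVM types.
enum class RegBank { SGPR, VGPR };

static RegBank pickResultBank(const RegBank *Inputs, unsigned NumInputs,
                              bool DefIsSGPR) {
  for (unsigned I = 0; I != NumInputs; ++I)
    if (Inputs[I] == RegBank::VGPR)
      return RegBank::VGPR; // one VGPR input forces the whole node to VGPRs
  return DefIsSGPR ? RegBank::SGPR : RegBank::VGPR;
}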
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { + if (!MI->getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + continue; + unsigned DstReg = MRI.createVirtualRegister(RC); + MachineBasicBlock *InsertBB; + MachineBasicBlock::iterator Insert; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + InsertBB = MI->getParent(); + Insert = MI; + } else { + // MI is a PHI instruction. + InsertBB = MI->getOperand(i + 1).getMBB(); + Insert = InsertBB->getFirstTerminator(); + } + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), + get(AMDGPU::COPY), DstReg) + .addOperand(MI->getOperand(i)); + MI->getOperand(i).setReg(DstReg); + } + } + + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + + // Legalize MUBUF* instructions + // FIXME: If we start using the non-addr64 instructions for compute, we + // may need to legalize them here. + int SRsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (SRsrcIdx != -1) { + // We have an MUBUF instruction + MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), + RI.getRegClass(SRsrcRC))) { + // The operands are legal. + // FIXME: We may need to legalize operands besided srsrc. + return; + } + + MachineBasicBlock &MBB = *MI->getParent(); + // Extract the the ptr from the resource descriptor. 
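// Layout sketch for the replacement 128-bit resource descriptor built below,
// assuming only what this function does with it: the 64-bit base pointer is
// zeroed (the pointer extracted from the old descriptor is folded into VADDR
// instead) and the default data format constant fills the two high dwords.
// BufferRsrc and makeDefaultRsrc are illustrative names only.
#include <cstdint>

struct BufferRsrc {
  uint64_t BasePointer; // sub0_sub1: set to 0 here
  uint32_t Dword2;      // sub2: RSRC_DATA_FORMAT & 0xFFFFFFFF
  uint32_t Dword3;      // sub3: RSRC_DATA_FORMAT >> 32
};

static BufferRsrc makeDefaultRsrc(uint64_t RsrcDataFormat) {
  return {0, static_cast<uint32_t>(RsrcDataFormat & 0xFFFFFFFF),
          static_cast<uint32_t>(RsrcDataFormat >> 32)};
}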
+ + // SRsrcPtrLo = srsrc:sub0 + unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); + + // SRsrcPtrHi = srsrc:sub1 + unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); + + // Create an empty resource descriptor + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + // Zero64 = 0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), + Zero64) + .addImm(0); + + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); + + // NewSRsrc = {Zero64, SRsrcFormat} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + unsigned NewVAddrLo; + unsigned NewVAddrHi; + if (VAddr) { + // This is already an ADDR64 instruction so we need to add the pointer + // extracted from the resource descriptor to the current value of VAddr. + NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), + NewVAddrLo) + .addReg(SRsrcPtrLo) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0) + .addReg(AMDGPU::VCC, RegState::ImplicitDefine); + + // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), + NewVAddrHi) + .addReg(SRsrcPtrHi) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1) + .addReg(AMDGPU::VCC, RegState::ImplicitDefine) + .addReg(AMDGPU::VCC, RegState::Implicit); + + } else { + // This instructions is the _OFFSET variant, so we need to convert it to + // ADDR64. + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); + + // Create the new instruction. + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + MachineInstr *Addr64 = + BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. 
+ .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0); // tfe + + MI->removeFromParent(); + MI = Addr64; + + NewVAddrLo = SRsrcPtrLo; + NewVAddrHi = SRsrcPtrHi; + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + } + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); + + + // Update the instruction to use NewVaddr + VAddr->setReg(NewVAddr); + // Update the instruction to use NewSRsrc + SRsrc->setReg(NewSRsrc); + } +} + +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes + // on VI. + + bool IsKill = SBase->isKill(); + if (OffOp) { + bool isVI = + MBB->getParent()->getSubtarget().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS; + unsigned OffScale = isVI ? 1 : 4; + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm() * OffScale; + unsigned HiOffset = LoOffset + HalfSize; + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + // Use addReg instead of addOperand + // to make sure kill flag is cleared. + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addImm(LoOffset / OffScale); + + if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset); // The offset in register is in bytes. 
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addReg(OffsetSGPR); + } else { + Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addImm(HiOffset / OffScale); + } + } else { + // Handle the _SGPR variant + MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); + Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addOperand(*SOff); + unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) + .addOperand(*SOff) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addReg(OffsetSGPR); + } + + unsigned SubLo, SubHi; + switch (HalfSize) { + case 4: + SubLo = AMDGPU::sub0; + SubHi = AMDGPU::sub1; + break; + case 8: + SubLo = AMDGPU::sub0_sub1; + SubHi = AMDGPU::sub2_sub3; + break; + case 16: + SubLo = AMDGPU::sub0_sub1_sub2_sub3; + SubHi = AMDGPU::sub4_sub5_sub6_sub7; + break; + case 32: + SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + break; + default: + llvm_unreachable("Unhandled HalfSize"); + } + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) + .addOperand(MI->getOperand(0)) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); +} + +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { + MachineBasicBlock *MBB = MI->getParent(); + switch (MI->getOpcode()) { + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: { + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offsets on SI and byte offset on VI + // and MUBUF instructions always take a byte offset. 
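// Sketch of the offset conversion performed below: SI/CI SMRD immediates are
// in dwords, while VI immediates and all MUBUF offsets are in bytes, so the
// value must be scaled by 4 on pre-VI targets. The helper and parameter names
// are illustrative only.
static unsigned smrdOffsetToMubufBytes(unsigned ImmOffset, bool IsVIOrLater) {
  return IsVIOrLater ? ImmOffset : ImmOffset << 2; // dwords -> bytes on SI/CI
}
// e.g. an SI offset of 3 dwords becomes a 12-byte MUBUF offset.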
+ ImmOffset = MI->getOperand(2).getImm(); + if (MBB->getParent()->getSubtarget().getGeneration() <= + AMDGPUSubtarget::SEA_ISLANDS) + ImmOffset <<= 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(RsrcDataFormat >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(SRsrc); + } else { + MI->getOperand(2).ChangeToRegister(SRsrc, false); + } + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe + + const TargetRegisterClass *NewDstRC = + RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + break; + } + case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX16_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + } +} + +void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { + SmallVector Worklist; + Worklist.push_back(&TopInst); + + while (!Worklist.empty()) { + MachineInstr *Inst = Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + + // Handle some special cases + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; + case AMDGPU::S_MOV_B64: { + DebugLoc DL = Inst->getDebugLoc(); + + // If the source operand is a register we can replace this with a + // copy. 
+ if (Inst->getOperand(1).isReg()) { + MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) + .addOperand(Inst->getOperand(0)) + .addOperand(Inst->getOperand(1)); + Worklist.push_back(Copy); + } else { + // Otherwise, we need to split this into two movs, because there is + // no 64-bit VALU move instruction. + unsigned Reg = Inst->getOperand(0).getReg(); + unsigned Dst = split64BitImm(Worklist, + Inst, + MRI, + MRI.getRegClass(Reg), + Inst->getOperand(1)); + MRI.replaceRegWith(Reg, Dst); + } + Inst->eraseFromParent(); + continue; + } + case AMDGPU::S_AND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_OR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_XOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_NOT_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BFE_I64: { + splitScalar64BitBFE(Worklist, Inst); + Inst->eraseFromParent(); + continue; + } + + case AMDGPU::S_LSHL_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHL_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B64; + swapOperands(Inst); + } + break; + + case AMDGPU::S_BFE_U64: + case AMDGPU::S_BFM_B64: + llvm_unreachable("Moving this op to VALU not implemented"); + } + + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { + // We cannot move this instruction to the VALU, so we should try to + // legalize its operands instead. + legalizeOperands(Inst); + continue; + } + + // Use the new VALU Opcode. + const MCInstrDesc &NewDesc = get(NewOpcode); + Inst->setDesc(NewDesc); + + // Remove any references to SCC. Vector instructions can't read from it, and + // We're just about to add the implicit use / defs of VCC, and we don't want + // both. + for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { + MachineOperand &Op = Inst->getOperand(i); + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) + Inst->RemoveOperand(i); + } + + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + Inst->addOperand(MachineOperand::CreateImm(0)); + } + + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + } + + // Update the destination register class. + + const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); + + switch (Opcode) { + // For target instructions, getOpRegClass just returns the virtual + // register class associated with the operand, so we need to find an + // equivalent VGPR register class in order to move the instruction to the + // VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + continue; + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + continue; + break; + default: + break; + } + + unsigned DstReg = Inst->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + + // Legalize the operands + legalizeOperands(Inst); + + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); + } + } + } +} + +//===----------------------------------------------------------------------===// +// Indirect addressing callbacks +//===----------------------------------------------------------------------===// + +unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? 
+ MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + MachineOperand &Src1 = Inst->getOperand(2); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? + MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1RC = Src1.isReg() ? + MRI.getRegClass(Src1.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub0, Src1SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. 
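// Worked sketch of the 64-bit split performed above for bitwise ops: the low
// and high halves can be combined independently with the 32-bit opcode, then
// reassembled with REG_SEQUENCE. and64ViaHalves is an illustrative name.
#include <cstdint>

static uint64_t and64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) & static_cast<uint32_t>(B);             // sub0
  uint32_t Hi = static_cast<uint32_t>(A >> 32) & static_cast<uint32_t>(B >> 32); // sub1
  return (static_cast<uint64_t>(Hi) << 32) | Lo; // REG_SEQUENCE of the halves
}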
+ Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); + const TargetRegisterClass *SrcRC = Src.isReg() ? + MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + Worklist.push_back(First); + Worklist.push_back(Second); +} + +void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + uint32_t Imm = Inst->getOperand(2).getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + (void) Offset; + + // Only sext_inreg cases handled. + assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && + BitWidth <= 32 && + Offset == 0 && + "Not implemented"); + + if (BitWidth < 32) { + unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) + .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) + .addImm(31) + .addReg(MidRegLo); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(MidRegLo) + .addImm(AMDGPU::sub0) + .addReg(MidRegHi) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + return; + } + + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) + .addImm(31) + .addReg(Src.getReg(), 0, AMDGPU::sub0); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(Src.getReg(), 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); +} + +void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, + MachineInstr *Inst) const { + // Add the implict and explicit register definitions. 
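// Sketch of the 64-bit bit-count lowering above: V_BCNT_U32_B32 adds its
// second operand to the population count, so counting the low half with an
// add-in of 0 and then the high half with an add-in of the first result gives
// the full 64-bit count. The helper names below are illustrative only.
#include <cstdint>

static unsigned popcount32(uint32_t V) {
  unsigned N = 0;
  for (; V; V &= V - 1) // clear the lowest set bit each iteration
    ++N;
  return N;
}

static unsigned popcount64ViaHalves(uint64_t V) {
  unsigned Mid = popcount32(static_cast<uint32_t>(V)) + 0;  // first V_BCNT
  return popcount32(static_cast<uint32_t>(V >> 32)) + Mid;  // second V_BCNT
}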
+ if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, + int OpIndices[3]) const { + const MCInstrDesc &Desc = get(MI->getOpcode()); + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = AMDGPU::NoRegister; + + // First we need to consider the instruction's operand requirements before + // legalizing. Some operands are required to be SGPRs, such as implicit uses + // of VCC, but we are still bound by the constant bus requirement to only use + // one. + // + // If the operand's class is an SGPR, we can never move it. + + for (const MachineOperand &MO : MI->implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + if (MO.getReg() == AMDGPU::VCC) + return AMDGPU::VCC; + + if (MO.getReg() == AMDGPU::FLAT_SCR) + return AMDGPU::FLAT_SCR; + } + + unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = OpIndices[i]; + if (Idx == -1) + break; + + const MachineOperand &MO = MI->getOperand(Idx); + if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) + SGPRReg = MO.getReg(); + + if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + UsedSGPRs[i] = MO.getReg(); + } + + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; + + // We don't have a required SGPR operand, so we have a bit more freedom in + // selecting operands to move. + + // Try to select the most used SGPR. If an SGPR is equal to one of the + // others, we choose that. + // + // e.g. 
+ // V_FMA_F32 v0, s0, s0, s0 -> No moves + // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + + if (UsedSGPRs[0] != AMDGPU::NoRegister) { + if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[0]; + } + + if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { + if (UsedSGPRs[1] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[1]; + } + + return SGPRReg; +} + +MachineInstrBuilder SIInstrInfo::buildIndirectWrite( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) + .addReg(IndirectBaseReg, RegState::Define) + .addOperand(I->getOperand(0)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0) + .addReg(ValueReg); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectRead( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0); + +} + +void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + int End = getIndirectIndexEnd(MF); + int Begin = getIndirectIndexBegin(MF); + + if (End == -1) + return; + + + for (int Index = Begin; Index <= End; ++Index) + Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); +} + +MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, + unsigned OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); + if (Idx == -1) + return nullptr; + + return &MI.getOperand(Idx); +} + +uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; + if (ST.isAmdHsaOS()) + RsrcDataFormat |= (1ULL << 56); + + return RsrcDataFormat; +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h new file mode 100644 index 00000000000..64b5120841c --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -0,0 +1,391 @@ +//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIInstrInfo. 
+// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "SIDefines.h" +#include "SIRegisterInfo.h" + +namespace llvm { + +class SIInstrInfo : public AMDGPUInstrInfo { +private: + const SIRegisterInfo RI; + + unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + + unsigned split64BitImm(SmallVectorImpl &Worklist, + MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineOperand &Op) const; + + void swapOperands(MachineBasicBlock::iterator Inst) const; + + void splitScalar64BitUnaryOp(SmallVectorImpl &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitBFE(SmallVectorImpl &Worklist, + MachineInstr *Inst) const; + + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + + bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const; + + unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + +public: + explicit SIInstrInfo(const AMDGPUSubtarget &st); + + const SIRegisterInfo &getRegisterInfo() const override { + return RI; + } + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const final; + + bool shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + // \brief Returns an opcode that can be used to move a value to a \p DstRC + // register. If there is no hardware instruction that can store to \p + // DstRC, then AMDGPU::COPY is returned. 
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + unsigned commuteOpcode(const MachineInstr &MI) const; + + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI = false) const override; + bool findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + bool isTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA = nullptr) const; + + bool areMemAccessesTriviallyDisjoint( + MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + bool isMov(unsigned Opcode) const override; + + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const final; + + unsigned getMachineCSELookAheadLimit() const override { return 500; } + + bool isSALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SALU; + } + + bool isVALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VALU; + } + + bool isSOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP1; + } + + bool isSOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP2; + } + + bool isSOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPC; + } + + bool isSOPK(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK; + } + + bool isSOPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPP; + } + + bool isVOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP1; + } + + bool isVOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP2; + } + + bool isVOP3(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3; + } + + bool isVOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOPC; + } + + bool isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; + } + + bool isMTBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; + } + + bool isSMRD(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SMRD; + } + + bool isDS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DS; + } + + bool isMIMG(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MIMG; + } + + bool isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; + } + + bool isWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::WQM; + } + + bool isVGPRSpill(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; + } + + bool isInlineConstant(const APInt &Imm) const; + bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; + + bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const; + + /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// This function will return false if you pass it a 32-bit instruction. + bool hasVALU32BitEncoding(unsigned Opcode) const; + + /// \brief Returns true if this operand uses the constant bus. 
+ bool usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const; + + /// \brief Return true if this instruction has any modifiers. + /// e.g. src[012]_mod, omod, clamp. + bool hasModifiers(unsigned Opcode) const; + + bool hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const; + + bool verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const override; + + static unsigned getVALUOp(const MachineInstr &MI); + + bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; + + /// \brief Return the correct register class for \p OpNo. For target-specific + /// instructions, this will return the register class that has been defined + /// in tablegen. For generic instructions, like REG_SEQUENCE it will return + /// the register class of its machine operand. + /// to infer the correct register class base on the other operands. + const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const; + + /// \brief Return the size in bytes of the operand OpNo on the given + // instruction opcode. + unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; + + if (OpInfo.RegClass == -1) { + // If this is an immediate operand, this must be a 32-bit literal. + assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); + return 4; + } + + return RI.getRegClass(OpInfo.RegClass)->getSize(); + } + + /// \brief This form should usually be preferred since it handles operands + /// with unknown register classes. + unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { + return getOpRegClass(MI, OpNo)->getSize(); + } + + /// \returns true if it is legal for the operand at index \p OpNo + /// to read a VGPR. + bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; + + /// \brief Legalize the \p OpIndex operand of this instruction by inserting + /// a MOV. For example: + /// ADD_I32_e32 VGPR0, 15 + /// to + /// MOV VGPR1, 15 + /// ADD_I32_e32 VGPR0, VGPR1 + /// + /// If the operand being legalized is a register, then a COPY will be used + /// instead of MOV. + void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + + /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// for \p MI. + bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO = nullptr) const; + + /// \brief Legalize all operands in this instruction. This function may + /// create new instruction and insert them before \p MI. + void legalizeOperands(MachineInstr *MI) const; + + /// \brief Split an SMRD instruction into two smaller loads of half the + // size storing the results in \p Lo and \p Hi. + void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const; + + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + + /// \brief Replace this instruction's opcode with the equivalent VALU + /// opcode. This function will also move the users of \p MI to the + /// VALU if necessary. 
+ void moveToVALU(MachineInstr &MI) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, + unsigned SavReg, unsigned IndexReg) const; + + void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + + /// \brief Returns the operand named \p Op. If \p MI does not have an + /// operand named \c Op, this function returns nullptr. + MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + + const MachineOperand *getNamedOperand(const MachineInstr &MI, + unsigned OpName) const { + return getNamedOperand(const_cast(MI), OpName); + } + + uint64_t getDefaultRsrcDataFormat() const; + +}; + +namespace AMDGPU { + + int getVOPe64(uint16_t Opcode); + int getVOPe32(uint16_t Opcode); + int getCommuteRev(uint16_t Opcode); + int getCommuteOrig(uint16_t Opcode); + int getAddr64Inst(uint16_t Opcode); + int getAtomicRetOp(uint16_t Opcode); + int getAtomicNoRetOp(uint16_t Opcode); + + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; + const uint64_t RSRC_TID_ENABLE = 1LL << 55; + +} // End namespace AMDGPU + +namespace SI { +namespace KernelInputOffsets { + +/// Offsets in bytes from the start of the input buffer +enum Offsets { + NGROUPS_X = 0, + NGROUPS_Y = 4, + NGROUPS_Z = 8, + GLOBAL_SIZE_X = 12, + GLOBAL_SIZE_Y = 16, + GLOBAL_SIZE_Z = 20, + LOCAL_SIZE_X = 24, + LOCAL_SIZE_Y = 28, + LOCAL_SIZE_Z = 32 +}; + +} // End namespace KernelInputOffsets +} // End namespace SI + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td new file mode 100644 index 00000000000..93e4ca74ec3 --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -0,0 +1,2647 @@ +//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
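The AMDGPU namespace helpers declared at the end of SIInstrInfo.h above (getVOPe32, getVOPe64, getCommuteRev, getAddr64Inst, and so on) are thin wrappers around TableGen-generated instruction mapping tables; each returns -1 when an opcode has no counterpart. A small sketch of how a pass might use one of them (hypothetical helper, not code from this patch):

.. code-block:: cpp

   #include "SIInstrInfo.h"

   using namespace llvm;

   // Map a 32-bit encoded VALU opcode to its VOP3 (_e64) counterpart, keeping
   // the original opcode when no such mapping exists.
   static unsigned widenToVOP3(unsigned Opcode) {
     int E64 = AMDGPU::getVOPe64(Opcode);
     return E64 == -1 ? Opcode : static_cast<unsigned>(E64);
   }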
+// +//===----------------------------------------------------------------------===// +def isCI : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + +def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; + +class vop { + field bits<9> SI3; + field bits<10> VI3; +} + +class vopc si, bits<8> vi = !add(0x40, si)> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {0, si{7-0}}; + field bits<10> VI3 = {0, 0, vi{7-0}}; +} + +class vop1 si, bits<8> vi = si> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {1, 1, si{6-0}}; + field bits<10> VI3 = !add(0x140, vi); +} + +class vop2 si, bits<6> vi = si> : vop { + field bits<6> SI = si; + field bits<6> VI = vi; + + field bits<9> SI3 = {1, 0, 0, si{5-0}}; + field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; +} + +// Specify a VOP2 opcode for SI and VOP3 opcode for VI +// that doesn't have VOP2 encoding on VI +class vop23 si, bits<10> vi> : vop2 { + let VI3 = vi; +} + +class vop3 si, bits<10> vi = {0, si}> : vop { + let SI3 = si; + let VI3 = vi; +} + +class sop1 si, bits<8> vi = si> { + field bits<8> SI = si; + field bits<8> VI = vi; +} + +class sop2 si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class sopk si, bits<5> vi = si> { + field bits<5> SI = si; + field bits<5> VI = vi; +} + +// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum +// in AMDGPUInstrInfo.cpp +def SISubtarget { + int NONE = -1; + int SI = 0; + int VI = 1; +} + +//===----------------------------------------------------------------------===// +// SI DAG Nodes +//===----------------------------------------------------------------------===// + +def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, + [SDNPMayLoad, SDNPMemOperand] +>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", + SDTypeProfile<0, 13, + [SDTCisVT<0, v4i32>, // rsrc(SGPR) + SDTCisVT<1, iAny>, // vdata(VGPR) + SDTCisVT<2, i32>, // num_channels(imm) + SDTCisVT<3, i32>, // vaddr(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // offen(imm) + SDTCisVT<9, i32>, // idxen(imm) + SDTCisVT<10, i32>, // glc(imm) + SDTCisVT<11, i32>, // slc(imm) + SDTCisVT<12, i32> // tfe(imm) + ]>, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, + SDTCisVT<3, i32>]> +>; + +class SDSample : SDNode , SDTCisVT<2, v32i8>, + SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> +>; + +def SIsample : SDSample<"AMDGPUISD::SAMPLE">; +def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; +def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; +def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; + +def SIconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> +>; + +//===----------------------------------------------------------------------===// +// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 +// to be glued to the memory instructions. 
+//===----------------------------------------------------------------------===// + +def SIld_local : SDNode <"ISD::LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ + return isLocalLoad(cast(N)); +}]>; + +def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast(N)->getAddressingMode() == ISD::UNINDEXED && + cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def si_load_local_align8 : Aligned8Bytes < + (ops node:$ptr), (si_load_local node:$ptr) +>; + +def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def si_az_extload_local : AZExtLoadBase ; + +multiclass SIExtLoadLocal { + + def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast(N)->getMemoryVT() == MVT::i8;}] + >; + + def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast(N)->getMemoryVT() == MVT::i16;}] + >; +} + +defm si_sextload_local : SIExtLoadLocal ; +defm si_az_extload_local : SIExtLoadLocal ; + +def SIst_local : SDNode <"ISD::STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def si_st_local : PatFrag < + (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ + return isLocalStore(cast(N)); +}]>; + +def si_store_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast(N)->getAddressingMode() == ISD::UNINDEXED && + !cast(N)->isTruncatingStore(); +}]>; + +def si_store_local_align8 : Aligned8Bytes < + (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) +>; + +def si_truncstore_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast(N)->isTruncatingStore(); +}]>; + +def si_truncstore_local_i8 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def si_truncstore_local_i16 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +multiclass SIAtomicM0Glue2 { + + def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] + >; + + def _local : local_binary_atomic_op (NAME#"_glue")>; +} + +defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; +defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; +defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; +defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; +defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; +defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; +defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; + +def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +defm si_atomic_cmp_swap : AtomicCmpSwapLocal ; + +// Transformation function, extract the lower 32bit of a 64bit immediate +def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), + MVT::i32); +}]>; + +def LO32f : SDNodeXFormgetValueAPF().bitcastToAPInt().trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); +}]>; + +// Transformation 
function, extract the upper 32bit of a 64bit immediate +def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); +}]>; + +def HI32f : SDNodeXFormgetValueAPF().bitcastToAPInt().lshr(32).trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), + MVT::f32); +}]>; + +def IMM8bitDWORD : PatLeaf <(imm), + [{return (N->getZExtValue() & ~0x3FC) == 0;}] +>; + +def as_dword_i32imm : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); +}]>; + +def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); +}]>; + +def as_i8imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); +}]>; + +def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); +}]>; + +def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + +def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i32 : SDNodeXFormgetTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i64 : SDNodeXFormgetTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); +}]>; + +def IMM8bit : PatLeaf <(imm), + [{return isUInt<8>(N->getZExtValue());}] +>; + +def IMM12bit : PatLeaf <(imm), + [{return isUInt<12>(N->getZExtValue());}] +>; + +def IMM16bit : PatLeaf <(imm), + [{return isUInt<16>(N->getZExtValue());}] +>; + +def IMM20bit : PatLeaf <(imm), + [{return isUInt<20>(N->getZExtValue());}] +>; + +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + +def mubuf_vaddr_offset : PatFrag< + (ops node:$ptr, node:$offset, node:$imm_offset), + (add (add node:$ptr, node:$offset), node:$imm_offset) +>; + +class InlineImm : PatLeaf <(vt imm), [{ + return isInlineImmediate(N); +}]>; + +class InlineFPImm : PatLeaf <(vt fpimm), [{ + return isInlineImmediate(N); +}]>; + +class SGPRImm : PatLeafgetGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return false; + } + const SIRegisterInfo *SIRI = + static_cast(Subtarget->getRegisterInfo()); + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + return true; + } + } + return false; +}]>; + +//===----------------------------------------------------------------------===// +// Custom Operands +//===----------------------------------------------------------------------===// + +def FRAMEri32 : Operand { + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); +} + +def SoppBrTarget : AsmOperandClass { + let Name = "SoppBrTarget"; + let ParserMethod = "parseSOppBrTarget"; +} + +def sopp_brtarget : Operand { + let EncoderMethod = "getSOPPBrEncoding"; + let OperandType = "OPERAND_PCREL"; + let ParserMatchClass = SoppBrTarget; +} + +include "SIInstrFormats.td" +include "VIInstrFormats.td" + +def MubufOffsetMatchClass : AsmOperandClass { + let Name = "MubufOffset"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +class DSOffsetBaseMatchClass : AsmOperandClass { + let Name = "DSOffset"#parser; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset"; +} + +def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; +def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; + +def 
DSOffset01MatchClass : AsmOperandClass { + let Name = "DSOffset1"; + let ParserMethod = "parseDSOff01OptionalOps"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset01"; +} + +class GDSBaseMatchClass : AsmOperandClass { + let Name = "GDS"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; +def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; + +class GLCBaseMatchClass : AsmOperandClass { + let Name = "GLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; +def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; + +class SLCBaseMatchClass : AsmOperandClass { + let Name = "SLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; +def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; +def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; + +class TFEBaseMatchClass : AsmOperandClass { + let Name = "TFE"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; +def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; +def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; + +def OModMatchClass : AsmOperandClass { + let Name = "OMod"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def ClampMatchClass : AsmOperandClass { + let Name = "Clamp"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +let OperandType = "OPERAND_IMMEDIATE" in { + +def offen : Operand { + let PrintMethod = "printOffen"; +} +def idxen : Operand { + let PrintMethod = "printIdxen"; +} +def addr64 : Operand { + let PrintMethod = "printAddr64"; +} +def mbuf_offset : Operand { + let PrintMethod = "printMBUFOffset"; + let ParserMatchClass = MubufOffsetMatchClass; +} +class ds_offset_base : Operand { + let PrintMethod = "printDSOffset"; + let ParserMatchClass = mc; +} +def ds_offset : ds_offset_base ; +def ds_offset_gds : ds_offset_base ; + +def ds_offset0 : Operand { + let PrintMethod = "printDSOffset0"; + let ParserMatchClass = DSOffset01MatchClass; +} +def ds_offset1 : Operand { + let PrintMethod = "printDSOffset1"; + let ParserMatchClass = DSOffset01MatchClass; +} +class gds_base : Operand { + let PrintMethod = "printGDS"; + let ParserMatchClass = mc; +} +def gds : gds_base ; + +def gds01 : gds_base ; + +class glc_base : Operand { + let PrintMethod = "printGLC"; + let ParserMatchClass = mc; +} + +def glc : glc_base ; +def glc_flat : glc_base ; + +class slc_base : Operand { + let PrintMethod = "printSLC"; + let ParserMatchClass = mc; +} + +def slc : slc_base ; +def slc_flat : slc_base ; +def slc_flat_atomic : slc_base ; + +class tfe_base : Operand { + let PrintMethod = "printTFE"; + let ParserMatchClass = mc; +} + +def tfe : tfe_base ; +def tfe_flat : tfe_base ; +def tfe_flat_atomic : tfe_base ; + +def omod : Operand { + let PrintMethod = "printOModSI"; + let ParserMatchClass = OModMatchClass; +} + +def ClampMod : Operand { + let PrintMethod = 
"printClampSI"; + let ParserMatchClass = ClampMatchClass; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +def VOPDstS64 : VOPDstOperand ; + +//===----------------------------------------------------------------------===// +// Complex patterns +//===----------------------------------------------------------------------===// + +def DS1Addr1Offset : ComplexPattern; +def DS64Bit4ByteAligned : ComplexPattern; + +def MUBUFAddr32 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64Atomic : ComplexPattern; +def MUBUFScratch : ComplexPattern; +def MUBUFOffset : ComplexPattern; +def MUBUFOffsetAtomic : ComplexPattern; + +def VOP3Mods0 : ComplexPattern; +def VOP3Mods0Clamp : ComplexPattern; +def VOP3Mods0Clamp0OMod : ComplexPattern; +def VOP3Mods : ComplexPattern; + +//===----------------------------------------------------------------------===// +// SI assembler operands +//===----------------------------------------------------------------------===// + +def SIOperand { + int ZERO = 0x80; + int VCC = 0x6A; + int FLAT_SCR = 0x68; +} + +def SRCMODS { + int NONE = 0; + int NEG = 1; +} + +def DSTCLAMP { + int NONE = 0; +} + +def DSTOMOD { + int NONE = 0; +} + +//===----------------------------------------------------------------------===// +// +// SI Instruction multiclass helpers. +// +// Instructions with _32 take 32-bit operands. +// Instructions with _64 take 64-bit operands. +// +// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit +// encoding is the standard encoding, but instruction that make use of +// any of the instruction modifiers must use the 64-bit encoding. +// +// Instructions with _e32 use the 32-bit encoding. +// Instructions with _e64 use the 64-bit encoding. +// +//===----------------------------------------------------------------------===// + +class SIMCInstr { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +//===----------------------------------------------------------------------===// +// EXP classes +//===----------------------------------------------------------------------===// + +class EXPCommon : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), + "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + let EXP_CNT = 1; + let Uses = [EXEC]; +} + +multiclass EXP_m { + + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + } + + def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + + def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; +} + +//===----------------------------------------------------------------------===// +// Scalar classes +//===----------------------------------------------------------------------===// + +class SOP1_Pseudo pattern> : + SOP1 , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOP1_Real_si : + SOP1 , + SOP1e , + SIMCInstr { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; +} + +class SOP1_Real_vi : + SOP1 , + SOP1e , + SIMCInstr { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; +} + +multiclass SOP1_m pattern> { + + def "" : SOP1_Pseudo ; + + def _si : SOP1_Real_si ; + + def _vi : SOP1_Real_vi ; + +} + +multiclass SOP1_32 pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; + +multiclass SOP1_64 pattern> : SOP1_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" 
$dst, $src0", pattern +>; + +// no input, 64-bit output. +multiclass SOP1_64_0 pattern> { + def "" : SOP1_Pseudo ; + + def _si : SOP1_Real_si { + let ssrc0 = 0; + } + + def _vi : SOP1_Real_vi { + let ssrc0 = 0; + } +} + +// 64-bit input, no output +multiclass SOP1_1 pattern> { + def "" : SOP1_Pseudo ; + + def _si : SOP1_Real_si { + let sdst = 0; + } + + def _vi : SOP1_Real_vi { + let sdst = 0; + } +} + +// 64-bit input, 32-bit output. +multiclass SOP1_32_64 pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +class SOP2_Pseudo pattern> : + SOP2, + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 4; + + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. + field bits<7> sdst = 0; +} + +class SOP2_Real_si : + SOP2, + SOP2e, + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class SOP2_Real_vi : + SOP2, + SOP2e, + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass SOP2_SELECT_32 pattern> { + def "" : SOP2_Pseudo ; + + def _si : SOP2_Real_si ; + + def _vi : SOP2_Real_vi ; +} + +multiclass SOP2_m pattern> { + + def "" : SOP2_Pseudo ; + + def _si : SOP2_Real_si ; + + def _vi : SOP2_Real_vi ; + +} + +multiclass SOP2_32 pattern> : SOP2_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64 pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64_32 pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +class SOPC_Helper op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []>; + +class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper; + +class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper; + +class SOPK_Pseudo pattern> : + SOPK , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOPK_Real_si : + SOPK , + SOPKe , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; +} + +class SOPK_Real_vi : + SOPK , + SOPKe , + SIMCInstr { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; +} + +multiclass SOPK_m { + def "" : SOPK_Pseudo ; + + def _si : SOPK_Real_si ; + + def _vi : SOPK_Real_vi ; + +} + +multiclass SOPK_32 pattern> { + def "" : SOPK_Pseudo ; + + def _si : SOPK_Real_si ; + + def _vi : SOPK_Real_vi ; +} + +multiclass SOPK_SCC pattern> { + def "" : SOPK_Pseudo ; + + let DisableEncoding = "$dst" in { + def _si : SOPK_Real_si ; + + def _vi : SOPK_Real_vi ; + } +} + +multiclass SOPK_32TIE pattern> : SOPK_m < + op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), + " $sdst, $simm16" +>; + +multiclass SOPK_IMM32 { + + def "" : SOPK_Pseudo ; + + def _si : SOPK , + SOPK64e , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; + } + + def _vi : SOPK , + SOPK64e , + SIMCInstr { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; + } +} +//===----------------------------------------------------------------------===// +// SMRD classes +//===----------------------------------------------------------------------===// + +class SMRD_Pseudo pattern> : + SMRD , + 
SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SMRD_Real_si op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD , + SMRDe , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class SMRD_Real_vi op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD , + SMEMe_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass SMRD_m op, string opName, bit imm, dag outs, dag ins, + string asm, list pattern> { + + def "" : SMRD_Pseudo ; + + def _si : SMRD_Real_si ; + + // glc is only applicable to scalar stores, which are not yet + // implemented. + let glc = 0 in { + def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + } +} + +multiclass SMRD_Helper op, string opName, RegisterClass baseClass, + RegisterClass dstClass> { + defm _IMM : SMRD_m < + op, opName#"_IMM", 1, (outs dstClass:$dst), + (ins baseClass:$sbase, u32imm:$offset), + opName#" $dst, $sbase, $offset", [] + >; + + defm _SGPR : SMRD_m < + op, opName#"_SGPR", 0, (outs dstClass:$dst), + (ins baseClass:$sbase, SReg_32:$soff), + opName#" $dst, $sbase, $soff", [] + >; +} + +//===----------------------------------------------------------------------===// +// Vector ALU classes +//===----------------------------------------------------------------------===// + +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps { + let PrintMethod = "printOperandAndMods"; +} + +def InputModsMatchClass : AsmOperandClass { + let Name = "RegWithInputMods"; +} + +def InputModsNoDefault : Operand { + let PrintMethod = "printOperandAndMods"; + let ParserMatchClass = InputModsMatchClass; +} + +class getNumSrcArgs { + int ret = + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 + 3)); // VOP3 +} + +// Returns the register class to use for the destination of VOP[123C] +// instructions for the given VT. +class getVALUDstForVT { + RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand, + !if(!eq(VT.Size, 64), VOPDstOperand, + VOPDstOperand)); // else VT == i1 +} + +// Returns the register class to use for source 0 of VOP[12C] +// instructions for the given VT. +class getVOPSrc0ForVT { + RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); +} + +// Returns the register class to use for source 1 of VOP[12C] for the +// given VT. +class getVOPSrc1ForVT { + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); +} + +// Returns the register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3SrcForVT { + RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); +} + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +class hasModifiers { + bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, 0)); +} + +// Returns the input arguments for VOP[12C] instructions for the given SrcVT. +class getIns32 { + dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 + !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 + (ins))); +} + +// Returns the input arguments for VOP3 instructions for the given SrcVT. 
+class getIns64 { + + dag ret = + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP1 without modifiers + (ins Src0RC:$src0) + /* endif */ ), + !if (!eq(NumSrcArgs, 2), + !if (!eq(HasModifiers, 1), + // VOP 2 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP2 without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + /* endif */ ) + /* NumSrcArgs == 3 */, + !if (!eq(HasModifiers, 1), + // VOP3 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + InputModsNoDefault:$src2_modifiers, Src2RC:$src2, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP3 without modifiers + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + /* endif */ ))); +} + +// Returns the assembly string for the inputs and outputs of a VOP[12C] +// instruction. This does not add the _e32 suffix, so it can be reused +// by getAsm64. +class getAsm32 { + string src1 = ", $src1"; + string src2 = ", $src2"; + string ret = "$dst, $src0"# + !if(!eq(NumSrcArgs, 1), "", src1)# + !if(!eq(NumSrcArgs, 3), src2, ""); +} + +// Returns the assembly string for the inputs and outputs of a VOP3 +// instruction. +class getAsm64 { + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string ret = + !if(!eq(HasModifiers, 0), + getAsm32.ret, + "$dst, "#src0#src1#src2#"$clamp"#"$omod"); +} + + +class VOPProfile _ArgVT> { + + field list ArgVT = _ArgVT; + + field ValueType DstVT = ArgVT[0]; + field ValueType Src0VT = ArgVT[1]; + field ValueType Src1VT = ArgVT[2]; + field ValueType Src2VT = ArgVT[3]; + field RegisterOperand DstRC = getVALUDstForVT.ret; + field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; + field RegisterClass Src1RC32 = getVOPSrc1ForVT.ret; + field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; + field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; + field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; + + field int NumSrcArgs = getNumSrcArgs.ret; + field bit HasModifiers = hasModifiers.ret; + + field dag Outs = (outs DstRC:$dst); + + field dag Ins32 = getIns32.ret; + field dag Ins64 = getIns64.ret; + + field string Asm32 = getAsm32.ret; + field string Asm64 = getAsm64.ret; +} + +// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order +// for the instruction patterns to work. 
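// Worked example (derived from the getIns32/getIns64/getAsm32/getAsm64 helpers
// above, not text from the patch): for the two-source float profile
// VOP_F32_F32_F32 defined below, NumSrcArgs is 2 and HasModifiers is 1, so
//   Ins32 = (ins VSrc_32:$src0, VGPR_32:$src1)
//   Asm32 = "$dst, $src0, $src1"
//   Ins64 = (ins InputModsNoDefault:$src0_modifiers, VCSrc_32:$src0,
//                InputModsNoDefault:$src1_modifiers, VCSrc_32:$src1,
//                ClampMod:$clamp, omod:$omod)
//   Asm64 = "$dst, $src0_modifiers, $src1_modifiers$clamp$omod"
// Only the 64-bit (VOP3) form carries the source modifier, clamp and omod
// operands, which is why instructions that use modifiers must select _e64.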
+def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; + +def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; + +def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; +def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; +def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; +def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; +def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; + +def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; +def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; +def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; +def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + let Src0RC32 = VCSrc_32; +} + +def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; +def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); + let Asm64 = "$dst, $src0, $src1, $src2"; +} + +def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); + field string Asm = "$dst, $src0, $vsrc1, $src2"; +} +def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; +def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; + + +class VOP { + string OpName = opName; +} + +class VOP2_REV { + string RevOp = revOp; + bit IsOrig = isOrig; +} + +class AtomicNoRet { + string NoRetOp = noRetOp; + bit IsRet = isRet; +} + +class VOP1_Pseudo pattern, string opName> : + VOP1Common , + VOP , + SIMCInstr , + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; + + field bits<8> vdst; + field bits<9> src0; +} + +class VOP1_Real_si : + VOP1, + SIMCInstr { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class VOP1_Real_vi : + VOP1, + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP1_m pattern, + string opName> { + def "" : VOP1_Pseudo ; + + def _si : VOP1_Real_si ; + + def _vi : VOP1_Real_vi ; +} + +multiclass VOP1SI_m pattern, + string opName> { + def "" : VOP1_Pseudo ; + + def _si : VOP1_Real_si ; +} + +class VOP2_Pseudo 
pattern, string opName> : + VOP2Common , + VOP , + SIMCInstr, + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP2_Real_si : + VOP2 , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class VOP2_Real_vi : + VOP2 , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP2SI_m pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo , + VOP2_REV; + + def _si : VOP2_Real_si ; +} + +multiclass VOP2_m pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo , + VOP2_REV; + + def _si : VOP2_Real_si ; + + def _vi : VOP2_Real_vi ; + +} + +class VOP3DisableFields { + + bits<2> src0_modifiers = !if(HasModifiers, ?, 0); + bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); + bits<2> omod = !if(HasModifiers, ?, 0); + bits<1> clamp = !if(HasModifiers, ?, 0); + bits<9> src1 = !if(HasSrc1, ?, 0); + bits<9> src2 = !if(HasSrc2, ?, 0); +} + +class VOP3DisableModFields { + bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); + bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); + bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); + bits<2> omod = !if(HasOutputMods, ?, 0); + bits<1> clamp = !if(HasOutputMods, ?, 0); +} + +class VOP3_Pseudo pattern, string opName> : + VOP3Common , + VOP , + SIMCInstr, + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3e , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3e_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3be , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3be_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP3_m pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si , + VOP3DisableFields; + def _vi : VOP3_Real_vi , + VOP3DisableFields; +} + +// VOP3_m without source modifiers +multiclass VOP3_m_nomods pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + let src0_modifiers = 0, + src1_modifiers = 0, + src2_modifiers = 0, + clamp = 0, + omod = 0 in { + def _si : VOP3_Real_si ; + def _vi : VOP3_Real_vi ; + } +} + +multiclass VOP3_1_m pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si , + VOP3DisableFields<0, 0, HasMods>; + + def _vi : VOP3_Real_vi , + VOP3DisableFields<0, 0, HasMods>; +} + +multiclass VOP3SI_1_m pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si , + VOP3DisableFields<0, 0, HasMods>; + // No VI instruction. This class is for SI only. +} + +multiclass VOP3_2_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo , + VOP2_REV; + + def _si : VOP3_Real_si , + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi , + VOP3DisableFields<1, 0, HasMods>; +} + +multiclass VOP3SI_2_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo , + VOP2_REV; + + def _si : VOP3_Real_si , + VOP3DisableFields<1, 0, HasMods>; + + // No VI instruction. This class is for SI only. 
+} + +// XXX - Is v_div_scale_{f32|f64} only available in vop3b without +// option of implicit vcc use? +multiclass VOP3b_2_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo , + VOP2_REV; + + // The VOP2 variant puts the carry out into VCC, the VOP3 variant + // can write it into any SGPR. We currently don't use the carry out, + // so for now hardcode it to VCC as well. + let sdst = SIOperand.VCC, Defs = [VCC] in { + def _si : VOP3b_Real_si , + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3b_Real_vi , + VOP3DisableFields<1, 0, HasMods>; + } // End sdst = SIOperand.VCC, Defs = [VCC] +} + +multiclass VOP3b_3_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo ; + + + def _si : VOP3b_Real_si , + VOP3DisableFields<1, 1, HasMods>; + + def _vi : VOP3b_Real_vi , + VOP3DisableFields<1, 1, HasMods>; +} + +multiclass VOP3_C_m pattern, string opName, + bit HasMods, bit defExec, string revOp> { + + def "" : VOP3_Pseudo , + VOP2_REV; + + def _si : VOP3_Real_si , + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } + + def _vi : VOP3_Real_vi , + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } +} + +// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. +multiclass VOP2SI_3VI_m pattern = []> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : VOPAnyCommon , + SIMCInstr; + } + + def _si : VOP2 , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP3Common , + VOP3e_vi , + VOP3DisableFields <1, 0, 0>, + SIMCInstr { + let AssemblerPredicates = [isVI]; + } +} + +multiclass VOP1_Helper pat32, + dag ins64, string asm64, list pat64, + bit HasMods> { + + defm _e32 : VOP1_m ; + + defm _e64 : VOP3_1_m ; +} + +multiclass VOP1Inst : VOP1_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + P.HasModifiers +>; + +multiclass VOP1InstSI { + + defm _e32 : VOP1SI_m ; + + defm _e64 : VOP3SI_1_m ; +} + +multiclass VOP2_Helper pat32, + dag ins64, string asm64, list pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2_m ; + + defm _e64 : VOP3_2_m ; +} + +multiclass VOP2Inst : VOP2_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2InstSI { + defm _e32 : VOP2SI_m ; + + defm _e64 : VOP3SI_2_m ; +} + +multiclass VOP2b_Helper pat32, + dag ins64, string asm64, list pat64, + string revOp, bit HasMods> { + + defm _e32 : VOP2_m ; + + defm _e64 : VOP3b_2_m ; +} + +multiclass VOP2bInst : VOP2b_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +// A VOP2 instruction that is VOP3-only on VI. 
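// Hypothetical instantiation (the opcode values, the instruction name and the
// assumed <vop23 op, string opName, VOPProfile P> parameter order are made up
// for illustration): such an opcode is declared through the vop23 class from
// earlier in this file, which pairs its SI VOP2 encoding with its VI VOP3
// encoding:
//   defm V_FOO_B32 : VOP2_VI3_Inst <vop23<0x19, 0x199>, "v_foo_b32",
//                                   VOP_I32_I32_I32>;
// On SI this yields the usual _e32/_e64 pair; on VI only the VOP3 (_e64)
// encoding is produced by the helpers below.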
+multiclass VOP2_VI3_Helper pat32, + dag ins64, string asm64, list pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2SI_m ; + + defm _e64 : VOP3_2_m ; +} + +multiclass VOP2_VI3_Inst + : VOP2_VI3_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2MADK pattern = []> { + + def "" : VOP2_Pseudo ; + +let isCodeGenOnly = 0 in { + def _si : VOP2Common , + SIMCInstr , + VOP2_MADKe { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP2Common , + SIMCInstr , + VOP2_MADKe { + let AssemblerPredicates = [isVI]; + } +} // End isCodeGenOnly = 0 +} + +class VOPC_Pseudo pattern, string opName> : + VOPCCommon , + VOP , + SIMCInstr, + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +multiclass VOPC_m pattern, + string opName, bit DefExec, string revOpName = ""> { + def "" : VOPC_Pseudo ; + + def _si : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; + } + + def _vi : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; + } +} + +multiclass VOPC_Helper pat32, + dag out64, dag ins64, string asm64, list pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m ; + + defm _e64 : VOP3_C_m ; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. +multiclass VOPC_Class_Helper pat32, + dag out64, dag ins64, string asm64, list pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m ; + + defm _e64 : VOP3_C_m , + VOP3DisableModFields<1, 0, 0>; +} + +multiclass VOPCInst : VOPC_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + P.HasModifiers, DefExec, revOp +>; + +multiclass VOPCClassInst : VOPC_Class_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], + [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + P.HasModifiers, DefExec, opName +>; + + +multiclass VOPC_F32 : + VOPCInst ; + +multiclass VOPC_F64 : + VOPCInst ; + +multiclass VOPC_I32 : + VOPCInst ; + +multiclass VOPC_I64 : + VOPCInst ; + + +multiclass VOPCX + : VOPCInst ; + +multiclass VOPCX_F32 : + VOPCX ; + +multiclass VOPCX_F64 : + VOPCX ; + +multiclass VOPCX_I32 : + VOPCX ; + +multiclass VOPCX_I64 : + VOPCX ; + +multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods +>; + +multiclass VOPC_CLASS_F32 : + VOPCClassInst ; + +multiclass VOPCX_CLASS_F32 : + VOPCClassInst ; + +multiclass VOPC_CLASS_F64 : + VOPCClassInst ; + +multiclass VOPCX_CLASS_F64 : + VOPCClassInst ; + +multiclass VOP3Inst : VOP3_Helper < + op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. +multiclass VOP3_VCC_Inst : VOP3_Helper < + op, opName, + (outs P.DstRC.RegClass:$dst), + (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, + InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, + InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, + ClampMod:$clamp, + omod:$omod), + " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))], + 3, 1 +>; + +multiclass VOP3b_Helper pattern> : + VOP3b_3_m < + op, (outs vrc:$vdst, SReg_64:$sdst), + (ins InputModsNoDefault:$src0_modifiers, arc:$src0, + InputModsNoDefault:$src1_modifiers, arc:$src1, + InputModsNoDefault:$src2_modifiers, arc:$src2, + ClampMod:$clamp, omod:$omod), + opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, + opName, opName, 1, 1 +>; + +multiclass VOP3b_64 pattern> : + VOP3b_Helper ; + +multiclass VOP3b_32 pattern> : + VOP3b_Helper ; + + +class Vop3ModPat : Pat< + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), + (Inst i32:$src0_modifiers, P.Src0VT:$src0, + i32:$src1_modifiers, P.Src1VT:$src1, + i32:$src2_modifiers, P.Src2VT:$src2, + i1:$clamp, + i32:$omod)>; + +//===----------------------------------------------------------------------===// +// Interpolation opcodes +//===----------------------------------------------------------------------===// + +class VINTRP_Pseudo pattern> : + VINTRPCommon , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VINTRP_Real_si op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon , + VINTRPe , + SIMCInstr; + +class VINTRP_Real_vi op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon , + VINTRPe_vi , + SIMCInstr; + +multiclass VINTRP_m op, dag outs, dag ins, string asm, + list pattern = []> { + def "" : VINTRP_Pseudo ; + + def _si : VINTRP_Real_si ; + + def _vi : VINTRP_Real_vi ; +} + +//===----------------------------------------------------------------------===// +// Vector I/O classes +//===----------------------------------------------------------------------===// + +class DS_Pseudo pattern> : + DS , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class DS_Real_si op, string opName, dag outs, dag ins, string asm> : + 
DS , + DSe , + SIMCInstr { + let isCodeGenOnly = 0; +} + +class DS_Real_vi op, string opName, dag outs, dag ins, string asm> : + DS , + DSe_vi , + SIMCInstr ; + +class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm> : + DS_Real_si { + + // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; + let isCodeGenOnly = 0; +} + +class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm> : + DS_Real_vi { + + // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; +} + +multiclass DS_1A_RET op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr"#"$offset$gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + gds01:$gds), + string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } +} + +multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + let data1 = 0, vdst = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { + + def "" : DS_Pseudo ; + + let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } +} + +multiclass DS_1A1D_RET op, string opName, RegisterClass rc, + string noRetOp = "", + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + let data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_1A2D_RET_m op, string opName, RegisterClass rc, + string noRetOp = "", dag ins, + dag outs = (outs rc:$vdst), + string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; +} + +multiclass DS_1A2D_RET op, string asm, RegisterClass rc, + string noRetOp = "", RegisterClass src = rc> : + DS_1A2D_RET_m ; + +multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, + string noRetOp = opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + let vdst = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_0A_RET op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins ds_offset:$offset, gds:$gds), + string asm 
= opName#" $vdst"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo ; + + let addr = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } // end addr = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} + +multiclass DS_1A_RET_GDS op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), + string asm = opName#" $vdst, $addr"#"$offset gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0, gds = 1 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } // end data0 = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A_GDS op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr), + string asm = opName#" $addr gds"> { + + def "" : DS_Pseudo ; + + let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } // end vdst = 0, data = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo ; + + let vdst = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } // let vdst = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} + +//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo pattern> : + MTBUF , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class MTBUF_Real_si op, string opName, dag outs, dag ins, + string asm> : + MTBUF , + MTBUFe , + SIMCInstr; + +class MTBUF_Real_vi op, string opName, dag outs, dag ins, string asm> : + MTBUF , + MTBUFe_vi , + SIMCInstr ; + +multiclass MTBUF_m op, string opName, dag outs, dag ins, string asm, + list pattern> { + + def "" : MTBUF_Pseudo ; + + def _si : MTBUF_Real_si ; + + def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; + +} + +let mayStore = 1, mayLoad = 0 in { + +multiclass MTBUF_Store_Helper op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs), + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayStore = 1, mayLoad = 0 + +let mayLoad = 1, mayStore = 0 in { + +multiclass MTBUF_Load_Helper op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayLoad = 1, mayStore = 0 + +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// + +class mubuf si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +let isCodeGenOnly = 0 in { + +class MUBUF_si op, dag outs, dag ins, string asm, list pattern> 
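// Note: two encoding details from the helpers above: the VI form of an MTBUF
// opcode is simply the 3-bit SI opcode zero-extended to four bits
// ({0, op{2}, op{1}, op{0}}), and the mubuf<si, vi> class pairs a 7-bit SI
// encoding with its VI encoding, with the VI value defaulting to the SI value
// when only one is given.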
: + MUBUF , MUBUFe { + let lds = 0; +} + +} // End let isCodeGenOnly = 0 + +class MUBUF_vi op, dag outs, dag ins, string asm, list pattern> : + MUBUF , MUBUFe_vi { + let lds = 0; +} + +class MUBUFAddr64Table { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +class MUBUF_Pseudo pattern> : + MUBUF , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; + + // dummy fields, so that we can use let statements around multiclasses + bits<1> offen; + bits<1> idxen; + bits<8> vaddr; + bits<1> glc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class MUBUF_Real_si : + MUBUF , + MUBUFe , + SIMCInstr { + let lds = 0; +} + +class MUBUF_Real_vi : + MUBUF , + MUBUFe_vi , + SIMCInstr { + let lds = 0; +} + +multiclass MUBUF_m pattern> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <0>; + + let addr64 = 0, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si ; + } + + def _vi : MUBUF_Real_vi ; +} + +multiclass MUBUFAddr64_m pattern> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <1>; + + let addr64 = 1, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si ; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. +} + +multiclass MUBUFAtomicOffset_m pattern, bit is_return> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, + AtomicNoRet; + + let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si ; + } + + def _vi : MUBUF_Real_vi ; + } +} + +multiclass MUBUFAtomicAddr64_m pattern, bit is_return> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, + AtomicNoRet; + + let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { + def _si : MUBUF_Real_si ; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. 
+} + +multiclass MUBUF_Atomic { + + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + + // No return variants + let glc = 0 in { + + defm _ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_addr64", (outs), + (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 + >; + + defm _OFFSET : MUBUFAtomicOffset_m < + op, name#"_offset", (outs), + (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, + slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + >; + } // glc = 0 + + // Variant that return values + let glc = 1, Constraints = "$vdata = $vdata_in", + DisableEncoding = "$vdata_in" in { + + defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_rtn_addr64", (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", + [(set vt:$vdata, + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$vdata_in))], 1 + >; + + defm _RTN_OFFSET : MUBUFAtomicOffset_m < + op, name#"_rtn_offset", (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + [(set vt:$vdata, + (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, + i1:$slc), vt:$vdata_in))], 1 + >; + + } // glc = 1 + + } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 +} + +multiclass MUBUF_Load_Helper { + + let mayLoad = 1, mayStore = 0 in { + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m ; + } + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m ; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m ; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m ; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m ; + } + } +} + +multiclass MUBUF_Store_Helper { + let mayLoad = 0, mayStore = 1 in { + defm : MUBUF_m ; + + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m ; + } // offen = 0, idxen = 0, vaddr = 0 + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m ; + } // end offen = 1, idxen = 0 + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m ; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m ; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m ; + } + } // End mayLoad = 0, mayStore = 1 +} + +class FLAT_Load_Helper op, string asm, RegisterClass regClass> : + FLAT { + let data = 0; + let mayLoad = 1; +} + +class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : + FLAT { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let vdst = 0; +} + +multiclass FLAT_ATOMIC op, string name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc> { + + let mayLoad = 1, mayStore = 1 in { + def "" : FLAT , + AtomicNoRet { + let glc = 0; + let vdst = 0; + } + + def _RTN : FLAT , + AtomicNoRet { + let glc = 1; + } + } +} + +class MIMG_Mask { + string Op = op; + int Channels = channels; +} + +class MIMG_NoSampler_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc", + []> { + let 
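// Note: for both the MUBUF and FLAT atomics above, the glc bit selects the
// flavour: glc = 0 gives the no-return form, glc = 1 the returning form whose
// destination is tied to the data input (the "$vdata = $vdata_in" constraint
// for MUBUF, the _RTN defs for FLAT).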
ssamp = 0; + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; +} + +multiclass MIMG_NoSampler_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V2 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V4 : MIMG_NoSampler_Helper , + MIMG_Mask; +} + +multiclass MIMG_NoSampler op, string asm> { + defm _V1 : MIMG_NoSampler_Src_Helper ; + defm _V2 : MIMG_NoSampler_Src_Helper ; + defm _V3 : MIMG_NoSampler_Src_Helper ; + defm _V4 : MIMG_NoSampler_Src_Helper ; +} + +class MIMG_Sampler_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let WQM = wqm; +} + +multiclass MIMG_Sampler_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V2 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V4 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V8 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V16 : MIMG_Sampler_Helper , + MIMG_Mask; +} + +multiclass MIMG_Sampler op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper; + defm _V2 : MIMG_Sampler_Src_Helper; + defm _V3 : MIMG_Sampler_Src_Helper; + defm _V4 : MIMG_Sampler_Src_Helper; +} + +multiclass MIMG_Sampler_WQM op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper; + defm _V2 : MIMG_Sampler_Src_Helper; + defm _V3 : MIMG_Sampler_Src_Helper; + defm _V4 : MIMG_Sampler_Src_Helper; +} + +class MIMG_Gather_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + + // DMASK was repurposed for GATHER4. 4 components are always + // returned and DMASK works like a swizzle - it selects + // the component to fetch. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. 
+ // Therefore, disable all code which updates DMASK by setting these two: + let MIMG = 0; + let hasPostISelHook = 0; + let WQM = wqm; +} + +multiclass MIMG_Gather_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Gather_Helper , + MIMG_Mask; + def _V2 : MIMG_Gather_Helper , + MIMG_Mask; + def _V4 : MIMG_Gather_Helper , + MIMG_Mask; + def _V8 : MIMG_Gather_Helper , + MIMG_Mask; + def _V16 : MIMG_Gather_Helper , + MIMG_Mask; +} + +multiclass MIMG_Gather op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper; + defm _V2 : MIMG_Gather_Src_Helper; + defm _V3 : MIMG_Gather_Src_Helper; + defm _V4 : MIMG_Gather_Src_Helper; +} + +multiclass MIMG_Gather_WQM op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper; + defm _V2 : MIMG_Gather_Src_Helper; + defm _V3 : MIMG_Gather_Src_Helper; + defm _V4 : MIMG_Gather_Src_Helper; +} + +//===----------------------------------------------------------------------===// +// Vector instruction mappings +//===----------------------------------------------------------------------===// + +// Maps an opcode in e32 form to its e64 equivalent +def getVOPe64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; +} + +// Maps an opcode in e64 form to its e32 equivalent +def getVOPe32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["4"]]; +} + +def getMaskedMIMGOp : InstrMapping { + let FilterClass = "MIMG_Mask"; + let RowFields = ["Op"]; + let ColFields = ["Channels"]; + let KeyCol = ["4"]; + let ValueCols = [["1"], ["2"], ["3"] ]; +} + +// Maps an commuted opcode to its original version +def getCommuteOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its commuted version +def getCommuteRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +def getCommuteCmpOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its commuted version +def getCommuteCmpRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + + +def getMCOpcodeGen : InstrMapping { + let FilterClass = "SIMCInstr"; + let RowFields = ["PseudoInstr"]; + let ColFields = ["Subtarget"]; + let KeyCol = [!cast(SISubtarget.NONE)]; + let ValueCols = [[!cast(SISubtarget.SI)],[!cast(SISubtarget.VI)]]; +} + +def getAddr64Inst : InstrMapping { + let FilterClass = "MUBUFAddr64Table"; + let RowFields = ["OpName"]; + let ColFields = ["IsAddr64"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its version with a return value. +def getAtomicRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its returnless version. 
+def getAtomicNoRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +include "SIInstructions.td" +include "CIInstructions.td" +include "VIInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td new file mode 100644 index 00000000000..8c8d836776d --- /dev/null +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -0,0 +1,3327 @@ +//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file was originally auto-generated from a GPU register header file and +// all the instruction definitions were originally commented out. Instructions +// that are not yet supported remain commented out. +//===----------------------------------------------------------------------===// + +class InterpSlots { +int P0 = 2; +int P10 = 0; +int P20 = 1; +} +def INTERP : InterpSlots; + +def InterpSlot : Operand { + let PrintMethod = "printInterpSlot"; +} + +def SendMsgImm : Operand { + let PrintMethod = "printSendMsg"; +} + +def isGCN : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureGCN">; +def isSI : Predicate<"Subtarget->getGeneration() " + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; + +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} + +def WAIT_FLAG : InstFlag<"printWaitFlag"> { + let ParserMatchClass = SWaitMatchClass; +} + +let SubtargetPredicate = isGCN in { + +//===----------------------------------------------------------------------===// +// EXP Instructions +//===----------------------------------------------------------------------===// + +defm EXP : EXP_m; + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 1 in { + +// We are using the SGPR_32 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SGPR_32 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. 
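The AtomicNoRet records used throughout these definitions key each returning atomic to the name of its no-return counterpart, and getAtomicRetOp / getAtomicNoRetOp above expose that relation in both directions, presumably so the compiler can swap forms (for instance when an atomic's result goes unused). A rough C++ stand-in for one direction of the lookup; the map and function are illustrative only, with mnemonics taken from the DS definitions below:

.. code-block:: c++

   #include <iostream>
   #include <map>
   #include <string>

   // Toy model of the AtomicNoRet relation: returning form -> no-return form.
   std::string getNoRetForm(const std::string &RetOp) {
     static const std::map<std::string, std::string> RetToNoRet = {
         {"ds_add_rtn_u32", "ds_add_u32"},
         {"ds_sub_rtn_u32", "ds_sub_u32"},
     };
     auto It = RetToNoRet.find(RetOp);
     return It == RetToNoRet.end() ? RetOp : It->second;
   }

   int main() {
     std::cout << getNoRetForm("ds_add_rtn_u32") << "\n"; // ds_add_u32
     return 0;
   }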
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 +>; + +} // mayLoad = 1 + +//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// + +let isMoveImm = 1 in { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; + defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; + } // let isRematerializeable = 1 + + let Uses = [SCC] in { + defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; + defm S_CMOV_B64 : SOP1_64 , "s_cmov_b64", []>; + } // End Uses = [SCC] +} // End isMoveImm = 1 + +let Defs = [SCC] in { + defm S_NOT_B32 : SOP1_32 , "s_not_b32", + [(set i32:$dst, (not i32:$src0))] + >; + + defm S_NOT_B64 : SOP1_64 , "s_not_b64", + [(set i64:$dst, (not i64:$src0))] + >; + defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; + defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; +} // End Defs = [SCC] + + +defm S_BREV_B32 : SOP1_32 , "s_brev_b32", + [(set i32:$dst, (AMDGPUbrev i32:$src0))] +>; +defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; + +let Defs = [SCC] in { + defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; + defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; + defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", + [(set i32:$dst, (ctpop i32:$src0))] + >; + defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; +} // End Defs = [SCC] + +defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; +defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; +defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", + [(set i32:$dst, (cttz_zero_undef i32:$src0))] +>; +defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; + +defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", + [(set i32:$dst, (ctlz_zero_undef i32:$src0))] +>; + +defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; +defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", + [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] +>; +defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; +defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", + [(set i32:$dst, (sext_inreg i32:$src0, i8))] +>; +defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + +defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; +defm S_BITSET0_B64 : SOP1_64 , "s_bitset0_b64", []>; +defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; +defm S_BITSET1_B64 : SOP1_64 , "s_bitset1_b64", []>; +defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; +defm S_SETPC_B64 : SOP1_64 , "s_setpc_b64", []>; +defm S_SWAPPC_B64 : SOP1_64 , 
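// Note: the SOP1 patterns above map generic DAG nodes straight onto scalar
// ALU ops: s_brev_b32 <- AMDGPUbrev, s_bcnt1_i32_b32 <- ctpop,
// s_ff1_i32_b32 <- cttz_zero_undef, s_flbit_i32_b32 <- ctlz_zero_undef, and
// s_sext_i32_i8/_i16 <- sext_inreg from 8/16 bits. The SOP2 block that
// follows chains 64-bit integer adds through SCC the same way: s_add_u32
// produces the carry ("carry out goes to SCC") and s_addc_u32 consumes it
// ("carry in comes from SCC").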
"s_swappc_b64", []>; +defm S_RFE_B64 : SOP1_64 , "s_rfe_b64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +defm S_AND_SAVEEXEC_B64 : SOP1_64 , "s_and_saveexec_b64", []>; +defm S_OR_SAVEEXEC_B64 : SOP1_64 , "s_or_saveexec_b64", []>; +defm S_XOR_SAVEEXEC_B64 : SOP1_64 , "s_xor_saveexec_b64", []>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 , "s_andn2_saveexec_b64", []>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_64 , "s_orn2_saveexec_b64", []>; +defm S_NAND_SAVEEXEC_B64 : SOP1_64 , "s_nand_saveexec_b64", []>; +defm S_NOR_SAVEEXEC_B64 : SOP1_64 , "s_nor_saveexec_b64", []>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_64 , "s_xnor_saveexec_b64", []>; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +defm S_QUADMASK_B32 : SOP1_32 , "s_quadmask_b32", []>; +defm S_QUADMASK_B64 : SOP1_64 , "s_quadmask_b64", []>; +defm S_MOVRELS_B32 : SOP1_32 , "s_movrels_b32", []>; +defm S_MOVRELS_B64 : SOP1_64 , "s_movrels_b64", []>; +defm S_MOVRELD_B32 : SOP1_32 , "s_movreld_b32", []>; +defm S_MOVRELD_B64 : SOP1_64 , "s_movreld_b64", []>; +defm S_CBRANCH_JOIN : SOP1_1 , "s_cbranch_join", []>; +defm S_MOV_REGRD_B32 : SOP1_32 , "s_mov_regrd_b32", []>; +let Defs = [SCC] in { + defm S_ABS_I32 : SOP1_32 , "s_abs_i32", []>; +} // End Defs = [SCC] +defm S_MOV_FED_B32 : SOP1_32 , "s_mov_fed_b32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; +defm S_ADD_I32 : SOP2_32 , "s_add_i32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; +defm S_SUB_I32 : SOP2_32 , "s_sub_i32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in { +defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] + +defm S_MIN_I32 : SOP2_32 , "s_min_i32", + [(set i32:$dst, (smin i32:$src0, i32:$src1))] +>; +defm S_MIN_U32 : SOP2_32 , "s_min_u32", + [(set i32:$dst, (umin i32:$src0, i32:$src1))] +>; +defm S_MAX_I32 : SOP2_32 , "s_max_i32", + [(set i32:$dst, (smax i32:$src0, i32:$src1))] +>; +defm S_MAX_U32 : SOP2_32 , "s_max_u32", + [(set i32:$dst, (umax i32:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + + +let Uses = [SCC] in { + defm S_CSELECT_B32 : SOP2_32 , "s_cselect_b32", []>; + defm S_CSELECT_B64 : SOP2_64 , "s_cselect_b64", []>; +} // End Uses = [SCC] + +let Defs = [SCC] in { +defm S_AND_B32 : SOP2_32 , "s_and_b32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +defm S_AND_B64 : SOP2_64 , "s_and_b64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +defm S_OR_B32 : SOP2_32 , "s_or_b32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +defm S_OR_B64 : SOP2_64 , "s_or_b64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +defm S_XOR_B32 : SOP2_32 , "s_xor_b32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +defm S_XOR_B64 : SOP2_64 , "s_xor_b64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; +defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; +defm S_ORN2_B32 : SOP2_32 , "s_orn2_b32", []>; +defm 
S_ORN2_B64 : SOP2_64 , "s_orn2_b64", []>; +defm S_NAND_B32 : SOP2_32 , "s_nand_b32", []>; +defm S_NAND_B64 : SOP2_64 , "s_nand_b64", []>; +defm S_NOR_B32 : SOP2_32 , "s_nor_b32", []>; +defm S_NOR_B64 : SOP2_64 , "s_nor_b64", []>; +defm S_XNOR_B32 : SOP2_32 , "s_xnor_b32", []>; +defm S_XNOR_B64 : SOP2_64 , "s_xnor_b64", []>; +} // End Defs = [SCC] + +// Use added complexity so these patterns are preferred to the VALU patterns. +let AddedComplexity = 1 in { +let Defs = [SCC] in { + +defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + +defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +defm S_BFM_B64 : SOP2_64 , "s_bfm_b64", []>; +defm S_MUL_I32 : SOP2_32 , "s_mul_i32", + [(set i32:$dst, (mul i32:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +let Defs = [SCC] in { +defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; +defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; +defm S_BFE_U64 : SOP2_64 , "s_bfe_u64", []>; +defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; +} // End Defs = [SCC] + +let sdst = 0 in { +defm S_CBRANCH_G_FORK : SOP2_m < + sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] +>; +} + +let Defs = [SCC] in { +defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; +} // End Defs = [SCC] + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1 in { +defm S_MOVK_I32 : SOPK_32 , "s_movk_i32", []>; +} // End isReMaterializable = 1 +let Uses = [SCC] in { + defm S_CMOVK_I32 : SOPK_32 , "s_cmovk_i32", []>; +} + +let isCompare = 1 in { + +/* +This instruction is disabled for 
now until we can figure out how to teach +the instruction selector to correctly use the S_CMP* vs V_CMP* +instructions. + +When this instruction is enabled the code generator sometimes produces this +invalid sequence: + +SCC = S_CMPK_EQ_I32 SGPR0, imm +VCC = COPY SCC +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 + +defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", + [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] +>; +*/ + +defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", []>; +defm S_CMPK_LG_I32 : SOPK_SCC , "s_cmpk_lg_i32", []>; +defm S_CMPK_GT_I32 : SOPK_SCC , "s_cmpk_gt_i32", []>; +defm S_CMPK_GE_I32 : SOPK_SCC , "s_cmpk_ge_i32", []>; +defm S_CMPK_LT_I32 : SOPK_SCC , "s_cmpk_lt_i32", []>; +defm S_CMPK_LE_I32 : SOPK_SCC , "s_cmpk_le_i32", []>; +defm S_CMPK_EQ_U32 : SOPK_SCC , "s_cmpk_eq_u32", []>; +defm S_CMPK_LG_U32 : SOPK_SCC , "s_cmpk_lg_u32", []>; +defm S_CMPK_GT_U32 : SOPK_SCC , "s_cmpk_gt_u32", []>; +defm S_CMPK_GE_U32 : SOPK_SCC , "s_cmpk_ge_u32", []>; +defm S_CMPK_LT_U32 : SOPK_SCC , "s_cmpk_lt_u32", []>; +defm S_CMPK_LE_U32 : SOPK_SCC , "s_cmpk_le_u32", []>; +} // End isCompare = 1 + +let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", + Constraints = "$sdst = $src0" in { + defm S_ADDK_I32 : SOPK_32TIE , "s_addk_i32", []>; + defm S_MULK_I32 : SOPK_32TIE , "s_mulk_i32", []>; +} + +defm S_CBRANCH_I_FORK : SOPK_m < + sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), + (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +defm S_GETREG_B32 : SOPK_32 , "s_getreg_b32", []>; +defm S_SETREG_B32 : SOPK_m < + sopk<0x13, 0x12>, "s_setreg_b32", (outs), + (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +// FIXME: Not on SI? +//defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; +defm S_SETREG_IMM32_B32 : SOPK_IMM32 < + sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), + (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" +>; + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", + [(IL_retflag)]> { + let simm16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", + [(br bb:$simm16)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc0 $simm16" +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc1 $simm16" +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccz $simm16" +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccnz $simm16" +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execz $simm16" +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execnz $simm16" +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", + [(int_AMDGPU_barrier_local)] +> { + let simm16 = 0; + let isBarrier = 1; + let 
hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; + +let Uses = [EXEC, M0] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] + >; +} // End Uses = [EXEC, M0] + +def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + +let isCompare = 1, isCommutable = 1 in { + +defm V_CMP_F_F32 : VOPC_F32 , "v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 , "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; +defm V_CMP_EQ_F32 : VOPC_F32 , "v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 , "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; +defm V_CMP_GT_F32 : VOPC_F32 , "v_cmp_gt_f32", COND_OGT>; +defm V_CMP_LG_F32 : VOPC_F32 , "v_cmp_lg_f32", COND_ONE>; +defm V_CMP_GE_F32 : VOPC_F32 , "v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 , "v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 , "v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 , "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; +defm V_CMP_NLG_F32 : VOPC_F32 , "v_cmp_nlg_f32", COND_UEQ>; +defm V_CMP_NGT_F32 : VOPC_F32 , "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; +defm V_CMP_NLE_F32 : VOPC_F32 , "v_cmp_nle_f32", COND_UGT>; +defm V_CMP_NEQ_F32 : VOPC_F32 , "v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : VOPC_F32 , "v_cmp_nlt_f32", COND_UGE>; +defm V_CMP_TRU_F32 : VOPC_F32 , "v_cmp_tru_f32">; + + +defm V_CMPX_F_F32 : VOPCX_F32 , "v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 , "v_cmpx_lt_f32", "v_cmpx_gt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 , "v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 , "v_cmpx_le_f32", "v_cmpx_ge_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 , "v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 , "v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 , "v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 , "v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 , "v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 , "v_cmpx_nge_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 , "v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 , "v_cmpx_ngt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 , "v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 , "v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 , "v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 , "v_cmpx_tru_f32">; + + +defm V_CMP_F_F64 : VOPC_F64 , "v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 , "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; +defm V_CMP_EQ_F64 : VOPC_F64 , "v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 , "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; +defm V_CMP_GT_F64 : VOPC_F64 , "v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 , 
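// Note: the trailing mnemonic on some VOPC defs above (e.g. "v_cmp_gt_f32" on
// v_cmp_lt_f32, "v_cmp_nle_f32" on v_cmp_nge_f32) names the comparison
// obtained by swapping the two operands; the getCommuteOrig/getCommuteRev
// mappings defined earlier tie each such pair together so compares can be
// commuted.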
"v_cmp_lg_f64", COND_ONE>; +defm V_CMP_GE_F64 : VOPC_F64 , "v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 , "v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 , "v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 , "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; +defm V_CMP_NLG_F64 : VOPC_F64 , "v_cmp_nlg_f64", COND_UEQ>; +defm V_CMP_NGT_F64 : VOPC_F64 , "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; +defm V_CMP_NLE_F64 : VOPC_F64 , "v_cmp_nle_f64", COND_UGT>; +defm V_CMP_NEQ_F64 : VOPC_F64 , "v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 , "v_cmp_nlt_f64", COND_UGE>; +defm V_CMP_TRU_F64 : VOPC_F64 , "v_cmp_tru_f64">; + + +defm V_CMPX_F_F64 : VOPCX_F64 , "v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 , "v_cmpx_lt_f64", "v_cmpx_gt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 , "v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 , "v_cmpx_le_f64", "v_cmpx_ge_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 , "v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 , "v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 , "v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 , "v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 , "v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 , "v_cmpx_nge_f64", "v_cmpx_nle_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 , "v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 , "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 , "v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 , "v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 , "v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 , "v_cmpx_tru_f64">; + + +let SubtargetPredicate = isSICI in { + +defm V_CMPS_F_F32 : VOPC_F32 , "v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 , "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 , "v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 , "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 , "v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 , "v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 , "v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 , "v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 , "v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 , "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; +defm V_CMPS_NLG_F32 : VOPC_F32 , "v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 , "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 , "v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 , "v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 , "v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 , "v_cmps_tru_f32">; + + +defm V_CMPSX_F_F32 : VOPCX_F32 , "v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 , "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 , "v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 , "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 , "v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 , "v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 , "v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 , "v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 , "v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 , "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 , "v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 , "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 , "v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 , "v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 , "v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 , "v_cmpsx_tru_f32">; + + +defm V_CMPS_F_F64 : VOPC_F64 , "v_cmps_f_f64">; +defm 
V_CMPS_LT_F64 : VOPC_F64 , "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 , "v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 , "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 , "v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 , "v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 , "v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 , "v_cmps_o_f64">; +defm V_CMPS_U_F64 : VOPC_F64 , "v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 , "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 , "v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 , "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 , "v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 , "v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 , "v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 , "v_cmps_tru_f64">; + + +defm V_CMPSX_F_F64 : VOPCX_F64 , "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 , "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 , "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 , "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 , "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 , "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPCX_F64 , "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 , "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 , "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 , "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 , "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 , "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 , "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 , "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 , "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 , "v_cmpsx_tru_f64">; + +} // End SubtargetPredicate = isSICI + +defm V_CMP_F_I32 : VOPC_I32 , "v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 , "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; +defm V_CMP_EQ_I32 : VOPC_I32 , "v_cmp_eq_i32", COND_EQ>; +defm V_CMP_LE_I32 : VOPC_I32 , "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; +defm V_CMP_GT_I32 : VOPC_I32 , "v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 , "v_cmp_ne_i32", COND_NE>; +defm V_CMP_GE_I32 : VOPC_I32 , "v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 , "v_cmp_t_i32">; + + +defm V_CMPX_F_I32 : VOPCX_I32 , "v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 , "v_cmpx_lt_i32", "v_cmpx_gt_i32">; +defm V_CMPX_EQ_I32 : VOPCX_I32 , "v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 , "v_cmpx_le_i32", "v_cmpx_ge_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 , "v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 , "v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 , "v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 , "v_cmpx_t_i32">; + + +defm V_CMP_F_I64 : VOPC_I64 , "v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 , "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; +defm V_CMP_EQ_I64 : VOPC_I64 , "v_cmp_eq_i64", COND_EQ>; +defm V_CMP_LE_I64 : VOPC_I64 , "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; +defm V_CMP_GT_I64 : VOPC_I64 , "v_cmp_gt_i64", COND_SGT>; +defm V_CMP_NE_I64 : VOPC_I64 , "v_cmp_ne_i64", COND_NE>; +defm V_CMP_GE_I64 : VOPC_I64 , "v_cmp_ge_i64", COND_SGE>; +defm V_CMP_T_I64 : VOPC_I64 , "v_cmp_t_i64">; + + +defm V_CMPX_F_I64 : VOPCX_I64 , "v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 , "v_cmpx_lt_i64", "v_cmpx_gt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 , "v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 , "v_cmpx_le_i64", "v_cmpx_ge_i64">; +defm V_CMPX_GT_I64 : 
VOPCX_I64 , "v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 , "v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 , "v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 , "v_cmpx_t_i64">; + + +defm V_CMP_F_U32 : VOPC_I32 , "v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 , "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; +defm V_CMP_EQ_U32 : VOPC_I32 , "v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 , "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; +defm V_CMP_GT_U32 : VOPC_I32 , "v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 , "v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 , "v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 , "v_cmp_t_u32">; + + +defm V_CMPX_F_U32 : VOPCX_I32 , "v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 , "v_cmpx_lt_u32", "v_cmpx_gt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 , "v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 , "v_cmpx_le_u32", "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 , "v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 , "v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 , "v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 , "v_cmpx_t_u32">; + + +defm V_CMP_F_U64 : VOPC_I64 , "v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 , "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; +defm V_CMP_EQ_U64 : VOPC_I64 , "v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 , "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; +defm V_CMP_GT_U64 : VOPC_I64 , "v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 , "v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 , "v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 , "v_cmp_t_u64">; + +defm V_CMPX_F_U64 : VOPCX_I64 , "v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 , "v_cmpx_lt_u64", "v_cmpx_gt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 , "v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 , "v_cmpx_le_u64", "v_cmpx_ge_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 , "v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 , "v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 , "v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 , "v_cmpx_t_u64">; + +} // End isCompare = 1, isCommutable = 1 + +defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 , "v_cmp_class_f32">; +defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 , "v_cmpx_class_f32">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 , "v_cmp_class_f64">; +defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 , "v_cmpx_class_f64">; + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + +defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; +defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +let mayLoad = 0 in { +defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : 
DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +} +defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; +defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; +defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; +defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; +defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; +let mayLoad = 0 in { +defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; +} +defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < + 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < + 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; +let SubtargetPredicate = isCI in { +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; +} // End isCI +defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; +let mayStore = 0 in { +defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; +defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; +} +defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; +defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; +defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; +defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; +defm DS_INC_U64 : 
DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +} +defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; +defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; + +let mayStore = 0 in { +defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; +defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; +} + +defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; +defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; +defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; +defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; +defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; +defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; +defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; +defm 
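// Note: the DS opcode layout is visible in the numbers above: the 64-bit
// operations mirror the 32-bit ones at an offset of 0x40 (ds_add_u32 = 0x0,
// ds_add_u64 = 0x40), and each returning "_rtn_" form sits 0x20 above its
// no-return counterpart (ds_add_rtn_u32 = 0x20, ds_add_rtn_u64 = 0x60).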
DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; +defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; +defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; +defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; +defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; + +defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; +defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; + +defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; +defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; +defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; +defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; +defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; +defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; +defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; +defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; +defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; +defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; +defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; +defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; + +defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; +defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; + +//let SubtargetPredicate = isCI in { +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 +//} // End isCI + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < + mubuf<0x00>, "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < + mubuf<0x01>, "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < + mubuf<0x02>, "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < + mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < + mubuf<0x04>, "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < + mubuf<0x05>, "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < + mubuf<0x06>, "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < + mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 +>; +defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global +>; +defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global +>; +defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global +>; +defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load +>; + +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < + mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global +>; + +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < + mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, 
truncstorei16_global +>; + +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < + mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store +>; + +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store +>; + +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < + mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < + mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global +>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < + mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < + mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global +>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < + mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < + mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < + mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < + mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Atomic < + mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Atomic < + mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < + mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ , "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 , "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 , "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 , "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 , "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 , "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 , "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 , "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 , "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 , "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , "buffer_atomic_fmin_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 , "buffer_wbinvl1_sc", []>; // isn't on CI & VI +//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 , 
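// Note: the MUBUF load patterns above also encode the extension kind:
// buffer_load_ubyte/ushort match the zero/any-extending i8/i16 global loads
// (az_extloadi8/16_global) while buffer_load_sbyte/sshort match the
// sign-extending ones (sextloadi8/16_global); the byte/short stores
// correspondingly use the truncating truncstorei8/16_global patterns.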
"buffer_wbinvl1_vol", []>; // isn't on SI +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 , "buffer_wbinvl1", []>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; + +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// + +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; +defm 
IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; +defm 
IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; +defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let vdst = 0, src0 = 0 in { +defm V_NOP : VOP1_m , (outs), (ins), "v_nop", [], "v_nop">; +} + +let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 + +let Uses = [EXEC] in { + +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + +def V_READFIRSTLANE_B32 : VOP1 < + 0x00000002, + (outs SReg_32:$vdst), + (ins VGPR_32:$src0), + "v_readfirstlane_b32 $vdst, $src0", + [] +>; + +} + +let SchedRW = [WriteQuarterRate32] in { + +defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", + VOP_I32_F64, fp_to_sint +>; +defm V_CVT_F64_I32 : VOP1Inst , "v_cvt_f64_i32", + VOP_F64_I32, sint_to_fp +>; +defm V_CVT_F32_I32 : VOP1Inst , "v_cvt_f32_i32", + VOP_F32_I32, sint_to_fp +>; +defm V_CVT_F32_U32 : VOP1Inst , "v_cvt_f32_u32", + VOP_F32_I32, uint_to_fp +>; +defm V_CVT_U32_F32 : VOP1Inst , "v_cvt_u32_f32", + VOP_I32_F32, fp_to_uint +>; +defm V_CVT_I32_F32 : VOP1Inst , "v_cvt_i32_f32", + VOP_I32_F32, fp_to_sint +>; +defm V_CVT_F16_F32 : VOP1Inst , "v_cvt_f16_f32", + VOP_I32_F32, fp_to_f16 +>; +defm V_CVT_F32_F16 : VOP1Inst , "v_cvt_f32_f16", + VOP_F32_I32, f16_to_fp +>; +defm V_CVT_RPI_I32_F32 : VOP1Inst , "v_cvt_rpi_i32_f32", + VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst , "v_cvt_flr_i32_f32", + VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", + VOP_F32_F64, fround +>; +defm V_CVT_F64_F32 : VOP1Inst , "v_cvt_f64_f32", + VOP_F64_F32, fextend +>; +defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 +>; +defm V_CVT_F32_UBYTE1 : VOP1Inst , "v_cvt_f32_ubyte1", + VOP_F32_I32, AMDGPUcvt_f32_ubyte1 +>; +defm V_CVT_F32_UBYTE2 : VOP1Inst , "v_cvt_f32_ubyte2", + VOP_F32_I32, AMDGPUcvt_f32_ubyte2 +>; +defm V_CVT_F32_UBYTE3 : VOP1Inst , "v_cvt_f32_ubyte3", + VOP_F32_I32, AMDGPUcvt_f32_ubyte3 +>; +defm V_CVT_U32_F64 : VOP1Inst , 
"v_cvt_u32_f64", + VOP_I32_F64, fp_to_uint +>; +defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32", + VOP_F64_I32, uint_to_fp +>; + +} // let SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", + VOP_F32_F32, AMDGPUfract +>; +defm V_TRUNC_F32 : VOP1Inst , "v_trunc_f32", + VOP_F32_F32, ftrunc +>; +defm V_CEIL_F32 : VOP1Inst , "v_ceil_f32", + VOP_F32_F32, fceil +>; +defm V_RNDNE_F32 : VOP1Inst , "v_rndne_f32", + VOP_F32_F32, frint +>; +defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32", + VOP_F32_F32, ffloor +>; +defm V_EXP_F32 : VOP1Inst , "v_exp_f32", + VOP_F32_F32, fexp2 +>; + +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_F32 : VOP1Inst , "v_log_f32", + VOP_F32_F32, flog2 +>; +defm V_RCP_F32 : VOP1Inst , "v_rcp_f32", + VOP_F32_F32, AMDGPUrcp +>; +defm V_RCP_IFLAG_F32 : VOP1Inst , "v_rcp_iflag_f32", + VOP_F32_F32 +>; +defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", + VOP_F32_F32, AMDGPUrsq +>; + +} //let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_F64 : VOP1Inst , "v_rcp_f64", + VOP_F64_F64, AMDGPUrcp +>; +defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", + VOP_F64_F64, AMDGPUrsq +>; + +} // let SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", + VOP_F32_F32, fsqrt +>; + +let SchedRW = [WriteDouble] in { + +defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64", + VOP_F64_F64, fsqrt +>; + +} // let SchedRW = [WriteDouble] + +defm V_SIN_F32 : VOP1Inst , "v_sin_f32", + VOP_F32_F32, AMDGPUsin +>; +defm V_COS_F32 : VOP1Inst , "v_cos_f32", + VOP_F32_F32, AMDGPUcos +>; +defm V_NOT_B32 : VOP1Inst , "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst , "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64", + VOP_I32_F64 +>; +defm V_FREXP_MANT_F64 : VOP1Inst , "v_frexp_mant_f64", + VOP_F64_F64 +>; +defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", VOP_F64_F64>; +defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32", + VOP_I32_F32 +>; +defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32", + VOP_F32_F32 +>; +let vdst = 0, src0 = 0 in { +defm V_CLREXCP : VOP1_m , (outs), (ins), "v_clrexcp", [], + "v_clrexcp" +>; +} +defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>; + +// These instruction only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let SchedRW = [WriteQuarterRate32] in { + +defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>; +defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32>; +defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy +>; + +} // End let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped +>; + +} // End SchedRW = [WriteDouble] + +} // End SubtargetPredicate = isSICI + +//===----------------------------------------------------------------------===// +// VINTRP Instructions 
+//===----------------------------------------------------------------------===// + +let Uses = [M0] in { + +// FIXME: Specify SchedRW for VINTRP insturctions. + +multiclass V_INTERP_P1_F32_m : VINTRP_m < + 0x00000000, + (outs VGPR_32:$dst), + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), + (i32 imm:$attr)))] +>; + +let OtherPredicates = [has32BankLDS] in { + +defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS] + +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { + +defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" + +let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { + +defm V_INTERP_P2_F32 : VINTRP_m < + 0x00000001, + (outs VGPR_32:$dst), + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" + +defm V_INTERP_MOV_F32 : VINTRP_m < + 0x00000002, + (outs VGPR_32:$dst), + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End Uses = [M0] + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +multiclass V_CNDMASK { + defm _e32 : VOP2_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], + name, name>; + + defm _e64 : VOP3_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, + name#!cast(VOP_CNDMASK.Asm64), [], name, 3>; +} + +defm V_CNDMASK_B32 : V_CNDMASK, "v_cndmask_b32">; + +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2Inst , "v_add_f32", + VOP_F32_F32_F32, fadd +>; + +defm V_SUB_F32 : VOP2Inst , "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst , "v_subrev_f32", + VOP_F32_F32_F32, null_frag, "v_sub_f32" +>; +} // End isCommutable = 1 + +let isCommutable = 1 in { + +defm V_MUL_LEGACY_F32 : VOP2Inst , "v_mul_legacy_f32", + VOP_F32_F32_F32, int_AMDGPU_mul +>; + +defm V_MUL_F32 : VOP2Inst , "v_mul_f32", + VOP_F32_F32_F32, fmul +>; + +defm V_MUL_I32_I24 : VOP2Inst , "v_mul_i32_i24", + VOP_I32_I32_I32, AMDGPUmul_i24 +>; + +defm V_MUL_HI_I32_I24 : VOP2Inst , "v_mul_hi_i32_i24", + VOP_I32_I32_I32 +>; + +defm V_MUL_U32_U24 : VOP2Inst , "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 +>; + +defm V_MUL_HI_U32_U24 : VOP2Inst , "v_mul_hi_u32_u24", + VOP_I32_I32_I32 +>; + +defm V_MIN_F32 : VOP2Inst , "v_min_f32", VOP_F32_F32_F32, + fminnum>; +defm V_MAX_F32 : VOP2Inst , "v_max_f32", VOP_F32_F32_F32, + fmaxnum>; +defm V_MIN_I32 : VOP2Inst , "v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst , "v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst , "v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst , "v_max_u32", VOP_I32_I32_I32>; + +defm V_LSHRREV_B32 : VOP2Inst < + vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshr_b32" +>; + +defm V_ASHRREV_I32 : VOP2Inst < + vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, + "v_ashr_i32" +>; + +defm V_LSHLREV_B32 : VOP2Inst < + vop2<0x1a, 0x12>, "v_lshlrev_b32", 
VOP_I32_I32_I32, null_frag, + "v_lshl_b32" +>; + +defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst , "v_xor_b32", VOP_I32_I32_I32>; + +defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + +defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32">; + +let isCommutable = 1 in { +defm V_MADAK_F32 : VOP2MADK , "v_madak_f32">; +} // End isCommutable = 1 + +let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +// No patterns so that the scalar instructions are always selected. +// The scalar versions will be replaced with vector when needed later. + +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions. +defm V_ADD_I32 : VOP2bInst , "v_add_i32", + VOP_I32_I32_I32, add +>; +defm V_SUB_I32 : VOP2bInst , "v_sub_i32", VOP_I32_I32_I32>; + +defm V_SUBREV_I32 : VOP2bInst , "v_subrev_i32", + VOP_I32_I32_I32, null_frag, "v_sub_i32" +>; + +let Uses = [VCC] in { // Carry-in comes from VCC +defm V_ADDC_U32 : VOP2bInst , "v_addc_u32", + VOP_I32_I32_I32_VCC +>; +defm V_SUBB_U32 : VOP2bInst , "v_subb_u32", + VOP_I32_I32_I32_VCC +>; +defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", + VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" +>; + +} // End Uses = [VCC] +} // End isCommutable = 1, Defs = [VCC] + +defm V_READLANE_B32 : VOP2SI_3VI_m < + vop3 <0x001, 0x289>, + "v_readlane_b32", + (outs SReg_32:$vdst), + (ins VGPR_32:$src0, SCSrc_32:$src1), + "v_readlane_b32 $vdst, $src0, $src1" +>; + +defm V_WRITELANE_B32 : VOP2SI_3VI_m < + vop3 <0x002, 0x28a>, + "v_writelane_b32", + (outs VGPR_32:$vdst), + (ins SReg_32:$src0, SCSrc_32:$src1), + "v_writelane_b32 $vdst, $src0, $src1" +>; + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_MIN_LEGACY_F32 : VOP2InstSI , "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy +>; +defm V_MAX_LEGACY_F32 : VOP2InstSI , "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy +>; + +let isCommutable = 1 in { +defm V_LSHR_B32 : VOP2InstSI , "v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2InstSI , "v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2InstSI , "v_lshl_b32", VOP_I32_I32_I32>; +} // End isCommutable = 1 +} // End let SubtargetPredicate = SICI + +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst , "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 + +defm V_BFM_B32 : VOP2_VI3_Inst , "v_bfm_b32", + VOP_I32_I32_I32 +>; +defm V_BCNT_U32_B32 : VOP2_VI3_Inst , "v_bcnt_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; +defm V_LDEXP_F32 : VOP2_VI3_Inst , "v_ldexp_f32", + VOP_F32_F32_I32, AMDGPUldexp +>; + +defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst , "v_cvt_pkaccum_u8_f32", + VOP_I32_F32_I32>; // TODO: set "Uses = dst" + +defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_i16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_u16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst , "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; +defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst , "v_cvt_pk_u16_u32", + VOP_I32_I32_I32 +>; +defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst , "v_cvt_pk_i16_i32", + VOP_I32_I32_I32 +>; + 
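+// Illustrative sketch (assembly spelling assumed, not taken from this
+// source): the VOP2b carry instructions above chain wide integer adds
+// through VCC. A 64-bit add of {v3,v2} and {v5,v4} into {v1,v0} is roughly:
+//   v_add_i32  v0, vcc, v2, v4        // low half, carry-out written to VCC
+//   v_addc_u32 v1, vcc, v3, v5, vcc   // high half, carry-in read from VCC
+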
+//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { +defm V_MAD_LEGACY_F32 : VOP3Inst , "v_mad_legacy_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_MAD_F32 : VOP3Inst , "v_mad_f32", + VOP_F32_F32_F32_F32, fmad +>; + +defm V_MAD_I32_I24 : VOP3Inst , "v_mad_i32_i24", + VOP_I32_I32_I32_I32, AMDGPUmad_i24 +>; +defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", + VOP_I32_I32_I32_I32, AMDGPUmad_u24 +>; +} // End isCommutable = 1 + +defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", + VOP_I32_I32_I32_I32, AMDGPUbfe_u32 +>; +defm V_BFE_I32 : VOP3Inst , "v_bfe_i32", + VOP_I32_I32_I32_I32, AMDGPUbfe_i32 +>; + +defm V_BFI_B32 : VOP3Inst , "v_bfi_b32", + VOP_I32_I32_I32_I32, AMDGPUbfi +>; + +let isCommutable = 1 in { +defm V_FMA_F32 : VOP3Inst , "v_fma_f32", + VOP_F32_F32_F32_F32, fma +>; +defm V_FMA_F64 : VOP3Inst , "v_fma_f64", + VOP_F64_F64_F64_F64, fma +>; +} // End isCommutable = 1 + +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; +defm V_ALIGNBIT_B32 : VOP3Inst , "v_alignbit_b32", + VOP_I32_I32_I32_I32 +>; +defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", + VOP_I32_I32_I32_I32 +>; + +defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; + +defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 +>; +defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 +>; +defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +defm V_MED3_F32 : VOP3Inst , "v_med3_f32", + VOP_F32_F32_F32_F32 +>; +defm V_MED3_I32 : VOP3Inst , "v_med3_i32", + VOP_I32_I32_I32_I32 +>; +defm V_MED3_U32 : VOP3Inst , "v_med3_u32", + VOP_I32_I32_I32_I32 +>; + +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; +defm V_SAD_U32 : VOP3Inst , "v_sad_u32", + VOP_I32_I32_I32_I32 +>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +defm V_DIV_FIXUP_F32 : VOP3Inst < + vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup +>; + +let SchedRW = [WriteDouble] in { + +defm V_DIV_FIXUP_F64 : VOP3Inst < + vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup +>; + +} // let SchedRW = [WriteDouble] + +let SchedRW = [WriteDouble] in { +let isCommutable = 1 in { + +defm V_ADD_F64 : VOP3Inst , "v_add_f64", + VOP_F64_F64_F64, fadd +>; +defm V_MUL_F64 : VOP3Inst , "v_mul_f64", + VOP_F64_F64_F64, fmul +>; + +defm V_MIN_F64 : VOP3Inst , "v_min_f64", + VOP_F64_F64_F64, fminnum +>; +defm V_MAX_F64 : VOP3Inst , "v_max_f64", + VOP_F64_F64_F64, fmaxnum +>; + +} // isCommutable = 1 + +defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", + VOP_F64_F64_I32, AMDGPUldexp +>; + +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { + +defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_U32 : VOP3Inst , 
"v_mul_hi_u32", + VOP_I32_I32_I32 +>; + +defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", + VOP_I32_I32_I32 +>; + +} // isCommutable = 1, SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteFloatFMA, WriteSALU] in { +defm V_DIV_SCALE_F32 : VOP3b_32 , "v_div_scale_f32", []>; +} + +let SchedRW = [WriteDouble, WriteSALU] in { +// Double precision division pre-scale. +defm V_DIV_SCALE_F64 : VOP3b_64 , "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, Uses = [VCC] in { + +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +defm V_DIV_FMAS_F32 : VOP3_VCC_Inst , "v_div_fmas_f32", + VOP_F32_F32_F32_F32, AMDGPUdiv_fmas +>; + +let SchedRW = [WriteDouble] in { +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +defm V_DIV_FMAS_F64 : VOP3_VCC_Inst , "v_div_fmas_f64", + VOP_F64_F64_F64_F64, AMDGPUdiv_fmas +>; + +} // End SchedRW = [WriteDouble] +} // End isCommutable = 1 + +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; + +let SchedRW = [WriteDouble] in { +defm V_TRIG_PREOP_F64 : VOP3Inst < + vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop +>; + +} // let SchedRW = [WriteDouble] + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32>; +defm V_LSHR_B64 : VOP3Inst , "v_lshr_b64", VOP_I64_I64_I32>; +defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32>; + +defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", + VOP_F32_F32_F32_F32>; + +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = isVI in { + +defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", + VOP_I64_I32_I64 +>; +defm V_LSHRREV_B64 : VOP3Inst , "v_lshrrev_b64", + VOP_I64_I32_I64 +>; +defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", + VOP_I64_I32_I64 +>; + +} // End SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isPseudo = 1 in { + +// For use in patterns +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] +>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// 64-bit vector move instruction. This is mainly used by the SIFoldOperands +// pass to enable folding of inline immediates. +def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 + +let hasSideEffects = 1 in { +def SGPR_USE : InstSI <(outs),(ins), "", []>; +} + +// SI pseudo instructions. These are used by the CFG structurizer pass +// and should be lowered to ISA instructions prior to codegen. 
+ +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { +let Uses = [EXEC], Defs = [EXEC] in { + +let isBranch = 1, isTerminator = 1 in { + +def SI_IF: InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, brtarget:$target), + "", + [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target), + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { + let Constraints = "$src = $dst"; +} + +def SI_LOOP : InstSI < + (outs), + (ins SReg_64:$saved, brtarget:$target), + "si_loop $saved, $target", + [(int_SI_loop i64:$saved, bb:$target)] +>; + +} // end isBranch = 1, isTerminator = 1 + +def SI_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src), + "si_else $dst, $src", + [(set i64:$dst, (int_SI_break i64:$src))] +>; + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, SReg_64:$src), + "si_if_break $dst, $vcc, $src", + [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src0, SReg_64:$src1), + "si_else_break $dst, $src0, $src1", + [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] +>; + +def SI_END_CF : InstSI < + (outs), + (ins SReg_64:$saved), + "si_end_cf $saved", + [(int_SI_end_cf i64:$saved)] +>; + +} // End Uses = [EXEC], Defs = [EXEC] + +let Uses = [EXEC], Defs = [EXEC,VCC] in { +def SI_KILL : InstSI < + (outs), + (ins VSrc_32:$src), + "si_kill $src", + [(int_AMDGPU_kill f32:$src)] +>; +} // End Uses = [EXEC], Defs = [EXEC,VCC] + +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 + +let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { + +//defm SI_ : RegisterLoadStore ; + +let UseNamedOperandTable = 1 in { + +def SI_RegisterLoad : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins FRAMEri32:$addr, i32imm:$chan), + "", [] +> { + let isRegisterLoad = 1; + let mayLoad = 1; +} + +class SIRegStore : InstSI < + outs, + (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), + "", [] +> { + let isRegisterStore = 1; + let mayStore = 1; +} + +let usesCustomInserter = 1 in { +def SI_RegisterStorePseudo : SIRegStore<(outs)>; +} // End usesCustomInserter = 1 +def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; + + +} // End UseNamedOperandTable = 1 + +def SI_INDIRECT_SRC : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + "si_indirect_src $dst, $temp, $src, $idx, $off", + [] +>; + +class SI_INDIRECT_DST : InstSI < + (outs rc:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), + "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", + [] +> { + let Constraints = "$src = $dst"; +} + +def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; + +} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] + +multiclass SI_SPILL_SGPR { + + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 +} + +// It's unclear whether you can use M0 as the output of v_readlane_b32 +// instructions, so use SGPR_32 register class for spills to prevent +// this from happening. 
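+// Each spill pseudo is a _SAVE/_RESTORE pair: _SAVE consumes the register
+// together with a frame index and the scratch resource/offset registers,
+// and _RESTORE reproduces it. SGPR spills are expected to be rewritten to
+// v_writelane_b32 / v_readlane_b32 accesses of VGPR lanes (hence the note
+// above), while the VGPR spill pseudos further below go through scratch
+// (MUBUF) memory.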
+defm SI_SPILL_S32 : SI_SPILL_SGPR ; +defm SI_SPILL_S64 : SI_SPILL_SGPR ; +defm SI_SPILL_S128 : SI_SPILL_SGPR ; +defm SI_SPILL_S256 : SI_SPILL_SGPR ; +defm SI_SPILL_S512 : SI_SPILL_SGPR ; + +multiclass SI_SPILL_VGPR { + let UseNamedOperandTable = 1, VGPRSpill = 1 in { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1, VGPRSpill = 1 +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR ; +defm SI_SPILL_V64 : SI_SPILL_VGPR ; +defm SI_SPILL_V96 : SI_SPILL_VGPR ; +defm SI_SPILL_V128 : SI_SPILL_VGPR ; +defm SI_SPILL_V256 : SI_SPILL_VGPR ; +defm SI_SPILL_V512 : SI_SPILL_VGPR ; + +let Defs = [SCC] in { + +def SI_CONSTDATA_PTR : InstSI < + (outs SReg_64:$dst), + (ins), + "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] +>; + +} // End Defs = [SCC] + +} // end IsCodeGenOnly, isPseudo + +} // end SubtargetPredicate = isGCN + +let Predicates = [isGCN] in { + +def : Pat< + (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), + (V_CNDMASK_B32_e64 $src2, $src1, + (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, + DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KILL 0xbf800000) +>; + +/* int_SI_vs_load_input */ +def : Pat< + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) +>; + +/* int_SI_export */ +def : Pat < + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, + f32:$src0, f32:$src1, f32:$src2, f32:$src3), + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, + $src0, $src1, $src2, $src3) +>; + +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern { + + // 1. SI-CI: Offset as 8bit DWORD immediate + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), + (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +multiclass SMRD_Pattern_vi { + + // 1. VI: Offset as 20bit immediate in bytes + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), + (vt (Instr_IMM $sbase, (as_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +let Predicates = [isSICI] in { +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +} // End Predicates = [isSICI] + +let Predicates = [isVI] in { +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +} // End Predicates = [isVI] + +let Predicates = [isSICI] in { + +// 1. 
Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) +>; + +} // End Predicates = [isSICI] + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) +>; + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (ctpop i64:$src)), + (i64 (REG_SEQUENCE SReg_64, + (S_BCNT1_I32_B64 $src), sub0, + (S_MOV_B32 0), sub1)) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version. +def : Pat < + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_U32 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// SOPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_AMDGPU_barrier_global), + (S_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [UnsafeFPMath] in { + +//def : RcpPat; +//defm : RsqPat; +//defm : RsqPat; + +def : RsqPat; +def : RsqPat; +} + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; + +def : Pat < + (i32 (select i1:$src0, i32:$src1, i32:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +/********** ======================= **********/ +/********** Image sampling patterns **********/ +/********** ======================= **********/ + +// Image + sampler +class SampleRawPattern : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc, $sampler) +>; + +multiclass SampleRawPatterns { + def : SampleRawPattern(opcode # _V4_V1), i32>; + def : SampleRawPattern(opcode # _V4_V2), v2i32>; + def : SampleRawPattern(opcode # _V4_V4), v4i32>; + def : SampleRawPattern(opcode # _V4_V8), v8i32>; + def : SampleRawPattern(opcode # _V4_V16), v16i32>; +} + +// Image only +class ImagePattern : Pat < + (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc) +>; + +multiclass ImagePatterns { + def : ImagePattern(opcode # _V4_V1), i32>; + def : ImagePattern(opcode # _V4_V2), v2i32>; + def : ImagePattern(opcode # _V4_V4), v4i32>; +} + +// Basic sample +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; 
+defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with comparison +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with offsets +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with comparison and offsets +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Gather opcodes +// Only the variants which make sense are defined. +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : ImagePattern; +defm : ImagePatterns; +defm : ImagePatterns; + +/* SIsample for simple 1D texture lookup */ +def : Pat < + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SamplePattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleRectPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleArrayPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowArrayPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +/* SIsample* for texture lookups consuming more address parameters */ +multiclass SamplePatterns { + def : SamplePattern ; + def : SampleRectPattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : 
SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; +} + +defm : SamplePatterns; +defm : SamplePatterns; +defm : SamplePatterns; +defm : SamplePatterns; + +/* int_SI_imageload for texture fetches consuming varying address parameters */ +class ImageLoadPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadMSAAPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayMSAAPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +multiclass ImageLoadPatterns { + def : ImageLoadPattern ; + def : ImageLoadArrayPattern ; +} + +multiclass ImageLoadMSAAPatterns { + def : ImageLoadMSAAPattern ; + def : ImageLoadArrayMSAAPattern ; +} + +defm : ImageLoadPatterns; +defm : ImageLoadPatterns; + +defm : ImageLoadMSAAPatterns; +defm : ImageLoadMSAAPatterns; + +/* Image resource information */ +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +/********** ============================================ **********/ +/********** Extraction, Insertion, Building and Casting **********/ +/********** ============================================ **********/ + +foreach Index = 0-2 in { + def Extract_Element_v2i32_#Index : Extract_Element < + i32, v2i32, Index, !cast(sub#Index) + >; + def Insert_Element_v2i32_#Index : Insert_Element < + i32, v2i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v2f32_#Index : Extract_Element < + f32, v2f32, Index, !cast(sub#Index) + >; + def Insert_Element_v2f32_#Index : Insert_Element < + f32, v2f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-3 in { + def Extract_Element_v4i32_#Index : Extract_Element < + i32, v4i32, Index, !cast(sub#Index) + >; + def Insert_Element_v4i32_#Index : Insert_Element < + i32, v4i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v4f32_#Index : Extract_Element < + f32, v4f32, Index, !cast(sub#Index) + >; + def Insert_Element_v4f32_#Index : Insert_Element < + f32, v4f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-7 in { + def Extract_Element_v8i32_#Index : Extract_Element < + i32, v8i32, Index, !cast(sub#Index) + >; + def Insert_Element_v8i32_#Index : Insert_Element < + i32, v8i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v8f32_#Index : Extract_Element < + f32, v8f32, Index, !cast(sub#Index) + >; + def Insert_Element_v8f32_#Index : Insert_Element < + f32, v8f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-15 in { + def Extract_Element_v16i32_#Index : Extract_Element < + i32, v16i32, Index, !cast(sub#Index) + >; + def Insert_Element_v16i32_#Index : Insert_Element < + i32, v16i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v16f32_#Index : Extract_Element < + f32, v16f32, Index, 
!cast(sub#Index) + >; + def Insert_Element_v16f32_#Index : Insert_Element < + f32, v16f32, Index, !cast(sub#Index) + >; +} + +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; + +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; + +/********** =================== **********/ +/********** Src & Dst modifiers **********/ +/********** =================== **********/ + +def : Pat < + (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), + (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) +>; + +/********** ================================ **********/ +/********** Floating point absolute/negative **********/ +/********** ================================ **********/ + +// Prevent expanding both fneg and fabs. + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f32:$src)), + (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ +>; + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. + sub1) +>; + +def : Pat < + (fabs f32:$src), + (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) +>; + +def : Pat < + (fneg f32:$src), + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) +>; + +def : Pat < + (fabs f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + sub1) +>; + +def : Pat < + (fneg f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), + sub1) +>; + +/********** ================== **********/ +/********** Immediate Patterns **********/ +/********** ================== **********/ + +def : Pat < + (SGPRImm<(i32 imm)>:$imm), + (S_MOV_B32 imm:$imm) +>; + +def : Pat < + (SGPRImm<(f32 fpimm)>:$imm), + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i32 imm:$imm), + (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < + (f32 fpimm:$imm), + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i64 InlineImm:$imm), + (S_MOV_B64 InlineImm:$imm) +>; + +// XXX - Should this use a s_cmp to set SCC? 
+ +// Set to sign-extended 64-bit value (true = -1, false = 0) +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 (i64 (as_i64imm $imm))) +>; + +def : Pat < + (f64 InlineFPImm:$imm), + (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm:$imm))) +>; + +/********** ================== **********/ +/********** Intrinsic Patterns **********/ +/********** ================== **********/ + +/* llvm.AMDGPU.pow */ +def : POW_Common ; + +def : Pat < + (int_AMDGPU_div f32:$src0, f32:$src1), + (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) +>; + +def : Pat < + (int_AMDGPU_cube v4f32:$src), + (REG_SEQUENCE VReg_128, + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub0, + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub1, + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub2, + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub3) +>; + +def : Pat < + (i32 (sext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) +>; + +class Ext32Pat : Pat < + (i32 (ext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) +>; + +def : Ext32Pat ; +def : Ext32Pat ; + +// Offset in an 32Bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) +>; + +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (V_CVT_U32_F32_e32 + (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) +>; + +def : Pat < + (int_SI_tid), + (V_MBCNT_HI_U32_B32_e64 0xffffffff, + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) +>; + +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat; +def : UMad24Pat; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1) +>; + +defm : BFIPatterns ; +def : ROTRPattern ; + +/********** ======================= **********/ +/********** Load/Store Patterns **********/ +/********** ======================= **********/ + +class DSReadPat : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst $ptr, (as_i16imm $offset), (i1 0)) +>; + +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; + +let AddedComplexity = 100 in { + +def : DSReadPat ; + +} // End AddedComplexity = 100 + +def : Pat < + (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) +>; + +class DSWritePat : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +def : DSWritePat ; +def : DSWritePat ; +def : DSWritePat ; + +let AddedComplexity = 100 in { + +def : DSWritePat ; +} // End AddedComplexity = 100 + +def : Pat < 
+ (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (i1 0)) +>; + +class DSAtomicRetPat : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec +// +// We need to use something for the data0, so we set a register to +// -1. For the non-rtn variants, the manual says it does +// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max +// will always do the increment so I'm assuming it's the same. +// +// We also load this -1 with s_mov_b32 / s_mov_b64 even though this +// needs to be a VGPR. The SGPR copy pass will fix this, and it's +// easier since there is no v_mov_b64. +class DSAtomicIncRetPat : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), + (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) +>; + + +class DSAtomicCmpXChg : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) +>; + + +// 32-bit atomics. +def : DSAtomicIncRetPat; +def : DSAtomicIncRetPat; + +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; + +def : DSAtomicCmpXChg; + +// 64-bit atomics. +def : DSAtomicIncRetPat; +def : DSAtomicIncRetPat; + +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; + +def : DSAtomicCmpXChg; + + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUFLoad_Pattern { + def : Pat < + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [isSICI] in { +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +} // End Predicates = [isSICI] + +class MUBUFScratchLoadPat : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; + +// BUFFER_LOAD_DWORD*, addr64=0 +multiclass MUBUF_Load_Dword { + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, + imm:$offset, 0, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 1, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword 
v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 0, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, + imm:$offset, 1, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; +} + +defm : MUBUF_Load_Dword ; +defm : MUBUF_Load_Dword ; +defm : MUBUF_Load_Dword ; + +class MUBUFScratchStorePat : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; + +/* +class MUBUFStore_Pattern : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +let Predicates = [isSICI] in { +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +} // End Predicates = [isSICI] + +*/ + +//===----------------------------------------------------------------------===// +// MTBUF Patterns +//===----------------------------------------------------------------------===// + +// TBUFFER_STORE_FORMAT_*, addr64=0 +class MTBUF_StoreResource : Pat< + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + i32:$soffset, imm:$inst_offset, imm:$dfmt, + imm:$nfmt, imm:$offen, imm:$idxen, + imm:$glc, imm:$slc, imm:$tfe), + (opcode + $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), + (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, + (as_i1imm $slc), (as_i1imm $tfe), $soffset) +>; + +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; + +let SubtargetPredicate = isCI in { + +defm V_QSAD_PK_U16_U8 : VOP3Inst , "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst , "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst , "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst , "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst , "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + +// Remaining instructions: +// FLAT_* +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// S_DCACHE_INV_VOL +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 + +} // End isCI + +/********** ====================== **********/ +/********** Indirect adressing **********/ +/********** ====================== **********/ + +multiclass SI_INDIRECT_Pattern { + + // 1. Extract with offset + def : Pat< + (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), + (SI_INDIRECT_SRC $vec, $idx, imm:$off) + >; + + // 2. Extract without offset + def : Pat< + (eltvt (vector_extract vt:$vec, i32:$idx)), + (SI_INDIRECT_SRC $vec, $idx, 0) + >; + + // 3. 
Insert with offset
+  def : Pat<
+    (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
+    (IndDst $vec, $idx, imm:$off, $val)
+  >;
+
+  // 4. Insert without offset
+  def : Pat<
+    (vector_insert vt:$vec, eltvt:$val, i32:$idx),
+    (IndDst $vec, $idx, 0, $val)
+  >;
+}
+
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+
+//===----------------------------------------------------------------------===//
+// Conversion Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+          (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
+
+// Handle sext_inreg in i64
+def : Pat <
+  (i64 (sext_inreg i64:$src, i1)),
+  (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i8)),
+  (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i16)),
+  (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i32)),
+  (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
+>;
+
+class ZExt_i64_i32_Pat : Pat <
+  (i64 (ext i32:$src)),
+  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
+>;
+
+class ZExt_i64_i1_Pat : Pat <
+  (i64 (ext i1:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
+    (S_MOV_B32 0), sub1)
+>;
+
+def : ZExt_i64_i32_Pat;
+def : ZExt_i64_i32_Pat;
+def : ZExt_i64_i1_Pat;
+def : ZExt_i64_i1_Pat;
+
+def : Pat <
+  (i64 (sext i32:$src)),
+  (REG_SEQUENCE SReg_64, $src, sub0,
+    (S_ASHR_I32 $src, 31), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i1:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 0, -1, $src), sub0,
+    (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+>;
+
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions that
+// copy from SCC into these operations will be moved to the VALU.
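// As a rough illustration (assumed lowering, with made-up register numbers):
// once the i1 operands have been placed in SGPR pairs, an IR-level
// "%c = and i1 %a, %b" selected through the pattern below becomes a
// wavefront-wide mask operation such as
//   s_and_b64 s[0:1], s[2:3], s[4:5]
// rather than a single SCC-based scalar bit operation.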
+def : Pat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) +>; + +def : Pat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +>; + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +def : Pat < + (f32 (sint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) +>; + +def : Pat < + (f32 (uint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) +>; + +def : Pat < + (f64 (sint_to_fp i1:$src)), + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) +>; + +def : Pat < + (f64 (uint_to_fp i1:$src)), + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (trunc i64:$a)), + (EXTRACT_SUBREG $a, sub0) +>; + +def : Pat < + (i1 (trunc i32:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) +>; + +def : Pat < + (i1 (trunc i64:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (EXTRACT_SUBREG $a, sub0)), 1) +>; + +def : Pat < + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) +>; + +def : Pat < + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) +>; + +multiclass BFMPatterns { + def : Pat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : Pat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV 0)) + >; +} + +defm : BFMPatterns ; +// FIXME: defm : BFMPatterns ; + +def : BFEPattern ; + +//===----------------------------------------------------------------------===// +// Fract Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI] in { + +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient +// way to implement it is using V_FRACT_F64. +// The workaround for the V_FRACT bug is: +// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) + +// Convert (x + (-floor(x)) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) +>; + +// Convert floor(x) to (x - fract(x)) +def : Pat < + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), + (V_ADD_F64 + $mods, + $x, + SRCMODS.NEG, + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isSI] + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + +//============================================================================// +// Miscellaneous Optimization Patterns +//============================================================================// + +def : SHA256MaPattern ; + +//============================================================================// +// Assembler aliases +//============================================================================// + +def : MnemonicAlias<"v_add_u32", "v_add_i32">; +def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; +def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; + +} // End isGCN predicate diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td new file mode 100644 index 00000000000..027a0a2f516 --- /dev/null +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -0,0 +1,199 @@ +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// SI Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + + +let TargetPrefix = "SI", isTarget = 1 in { + + def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; + def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; + + // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed + def int_SI_tbuffer_store : Intrinsic < + [], + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW + llvm_i32_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + []>; + + // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed + def int_SI_buffer_load_dword : Intrinsic < + [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + [IntrReadArgMem]>; + + def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Fully-flexible SAMPLE instruction. + class SampleRaw : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_v4i32_ty, // sampler(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Image instruction without a sampler. 
+ class Image : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Basic sample + def int_SI_image_sample : SampleRaw; + def int_SI_image_sample_cl : SampleRaw; + def int_SI_image_sample_d : SampleRaw; + def int_SI_image_sample_d_cl : SampleRaw; + def int_SI_image_sample_l : SampleRaw; + def int_SI_image_sample_b : SampleRaw; + def int_SI_image_sample_b_cl : SampleRaw; + def int_SI_image_sample_lz : SampleRaw; + def int_SI_image_sample_cd : SampleRaw; + def int_SI_image_sample_cd_cl : SampleRaw; + + // Sample with comparison + def int_SI_image_sample_c : SampleRaw; + def int_SI_image_sample_c_cl : SampleRaw; + def int_SI_image_sample_c_d : SampleRaw; + def int_SI_image_sample_c_d_cl : SampleRaw; + def int_SI_image_sample_c_l : SampleRaw; + def int_SI_image_sample_c_b : SampleRaw; + def int_SI_image_sample_c_b_cl : SampleRaw; + def int_SI_image_sample_c_lz : SampleRaw; + def int_SI_image_sample_c_cd : SampleRaw; + def int_SI_image_sample_c_cd_cl : SampleRaw; + + // Sample with offsets + def int_SI_image_sample_o : SampleRaw; + def int_SI_image_sample_cl_o : SampleRaw; + def int_SI_image_sample_d_o : SampleRaw; + def int_SI_image_sample_d_cl_o : SampleRaw; + def int_SI_image_sample_l_o : SampleRaw; + def int_SI_image_sample_b_o : SampleRaw; + def int_SI_image_sample_b_cl_o : SampleRaw; + def int_SI_image_sample_lz_o : SampleRaw; + def int_SI_image_sample_cd_o : SampleRaw; + def int_SI_image_sample_cd_cl_o : SampleRaw; + + // Sample with comparison and offsets + def int_SI_image_sample_c_o : SampleRaw; + def int_SI_image_sample_c_cl_o : SampleRaw; + def int_SI_image_sample_c_d_o : SampleRaw; + def int_SI_image_sample_c_d_cl_o : SampleRaw; + def int_SI_image_sample_c_l_o : SampleRaw; + def int_SI_image_sample_c_b_o : SampleRaw; + def int_SI_image_sample_c_b_cl_o : SampleRaw; + def int_SI_image_sample_c_lz_o : SampleRaw; + def int_SI_image_sample_c_cd_o : SampleRaw; + def int_SI_image_sample_c_cd_cl_o : SampleRaw; + + // Basic gather4 + def int_SI_gather4 : SampleRaw; + def int_SI_gather4_cl : SampleRaw; + def int_SI_gather4_l : SampleRaw; + def int_SI_gather4_b : SampleRaw; + def int_SI_gather4_b_cl : SampleRaw; + def int_SI_gather4_lz : SampleRaw; + + // Gather4 with comparison + def int_SI_gather4_c : SampleRaw; + def int_SI_gather4_c_cl : SampleRaw; + def int_SI_gather4_c_l : SampleRaw; + def int_SI_gather4_c_b : SampleRaw; + def int_SI_gather4_c_b_cl : SampleRaw; + def int_SI_gather4_c_lz : SampleRaw; + + // Gather4 with offsets + def int_SI_gather4_o : SampleRaw; + def int_SI_gather4_cl_o : SampleRaw; + def int_SI_gather4_l_o : SampleRaw; + def int_SI_gather4_b_o : SampleRaw; + def int_SI_gather4_b_cl_o : SampleRaw; + def int_SI_gather4_lz_o : SampleRaw; + + // Gather4 with comparison and offsets + def int_SI_gather4_c_o : SampleRaw; + def int_SI_gather4_c_cl_o : SampleRaw; + def int_SI_gather4_c_l_o : SampleRaw; + def int_SI_gather4_c_b_o : SampleRaw; + def int_SI_gather4_c_b_cl_o : SampleRaw; + def int_SI_gather4_c_lz_o : SampleRaw; + + def int_SI_getlod : SampleRaw; + + // Image instrinsics. + def int_SI_image_load : Image; + def int_SI_image_load_mip : Image; + def int_SI_getresinfo : Image; + + // Deprecated image and sample intrinsics. 
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
+  def int_SI_samplel : Sample;
+  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+}
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
new file mode 100644
index 00000000000..9b1d256dc5a
--- /dev/null
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -0,0 +1,421 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with nearby immediate offsets.
+// This will fuse operations such as
+//   ds_read_b32 v0, v2 offset:16
+//   ds_read_b32 v1, v2 offset:32
+// ==>
+//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+//   each other, and then only merges adjacent pairs of instructions. It would
+//   be good to be more flexible with interleaved instructions, and possibly run
+//   before scheduling. It currently misses stores of constants because loading
+//   the constant into the data register is placed between the stores, although
+//   this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, and recomputes live intervals and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads has offsets that are too large to fit in the 8-bit
+//   offsets, but are close enough to fit in the 8 bits, we can add to the base
+//   pointer and use the new reduced offsets.
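// To make the last point concrete (a hypothetical case, not handled by the
// pass as written): two ds_read_b32 of the same base with byte offsets 4096
// and 4100 have element offsets 1024 and 1025, which cannot be encoded in the
// 8-bit offset0/offset1 fields (and are not both multiples of 64). Adding
// 4096 to the base pointer once would reduce them to 0 and 1, which a single
// ds_read2_b32 could then encode.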
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-load-store-opt" + +namespace { + +class SILoadStoreOptimizer : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + + static bool offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned EltSize); + + MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize); + + void updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx); + + MachineBasicBlock::iterator mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + + MachineBasicBlock::iterator mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + +public: + static char ID; + + SILoadStoreOptimizer() + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), + LIS(nullptr) {} + + SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { + initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + } + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Load / Store Optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) + +char SILoadStoreOptimizer::ID = 0; + +char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; + +FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { + return new SILoadStoreOptimizer(TM); +} + +bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned Size) { + // XXX - Would the same offset be OK? Is there any reason this would happen or + // be useful? + if (Offset0 == Offset1) + return false; + + // This won't be valid if the offset isn't aligned. + if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + return false; + + unsigned EltOffset0 = Offset0 / Size; + unsigned EltOffset1 = Offset1 / Size; + + // Check if the new offsets fit in the reduced 8-bit range. + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + return true; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. 
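  // For example (assuming EltSize == 4): byte offsets 0 and 2048 give element
  // offsets 0 and 512; 512 does not fit in 8 bits, but both values are
  // multiples of 64, so the st64 encodings can use 0 and 512 / 64 = 8 instead.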
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) + return false; + + return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize){ + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + ++MBBI; + + if (MBBI->getOpcode() != I->getOpcode()) + return E; + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) + return MBBI; + } + + return E; +} + +void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), + E = MRI->reg_end(); I != E; ) { + MachineOperand &O = *I; + ++I; + O.substVirtReg(DstReg, SubIdx, *TRI); + } +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be careful, since the addresses could be subregisters themselves in weird + // cases, like vectors of pointers. + const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + + unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); + unsigned DestReg1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Read2Desc = TII->get(Opc); + + const TargetRegisterClass *SuperRC + = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + + DebugLoc DL = I->getDebugLoc(); + MachineInstrBuilder Read2 + = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + .addOperand(*AddrReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); + updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); + + LIS->RemoveMachineInstrFromMaps(I); + // Replacing Paired in the maps with Read2 allows us to avoid updating the + // live range for the m0 register. + LIS->ReplaceMachineInstrInMaps(Paired, Read2); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); + LIS->shrinkToUses(&AddrRegLI); + + LIS->getInterval(DestReg); // Create new LI + + DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + return Read2.getInstr(); +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be sure to use .addOperand(), and not .addReg() with these. We want to be + // sure we preserve the subregister index and any register flags set on them. + const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Data1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Write2Desc = TII->get(Opc); + DebugLoc DL = I->getDebugLoc(); + + // repairLiveintervalsInRange() doesn't handle physical register, so we have + // to update the M0 range manually. + SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); + LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); + bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + + MachineInstrBuilder Write2 + = BuildMI(*MBB, I, DL, Write2Desc) + .addOperand(*Addr) // addr + .addOperand(*Data0) // data0 + .addOperand(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + // XXX - How do we express subregisters here? 
+ unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + // This doesn't handle physical registers like M0 + LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); + + if (UpdateM0Range) { + SlotIndex Write2Index = LIS->getInstructionIndex(Write2); + M0Segment->end = Write2Index.getRegSlot(); + } + + DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + return Write2.getInstr(); +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I; + + // Don't combine if volatile. + if (MI.hasOrderedMemoryRef()) { + ++I; + continue; + } + + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { + unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeRead2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { + unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeWrite2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } + + ++I; + } + + return Modified; +} + +bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo &STM = MF.getSubtarget(); + TRI = static_cast(STM.getRegisterInfo()); + TII = static_cast(STM.getInstrInfo()); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis(); + + DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); + + assert(!MRI->isSSA()); + + bool Modified = false; + + for (MachineBasicBlock &MBB : MF) + Modified |= optimizeBlock(MBB); + + return Modified; +} diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp new file mode 100644 index 00000000000..c319b32111f --- /dev/null +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -0,0 +1,605 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass lowers the pseudo control flow instructions to real +/// machine instructions. +/// +/// All control flow is handled using predicated instructions and +/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector +/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs +/// by writting to the 64-bit EXEC register (each bit corresponds to a +/// single vector ALU). Typically, for predicates, a vector ALU will write +/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each +/// Vector ALU) and then the ScalarALU will AND the VCC register with the +/// EXEC to update the predicates. 
+/// +/// For example: +/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 +/// %SGPR0 = SI_IF %VCC +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 +/// %SGPR0 = SI_ELSE %SGPR0 +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 +/// SI_END_CF %SGPR0 +/// +/// becomes: +/// +/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask +/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// S_CBRANCH_EXECZ label0 // This instruction is an optional +/// // optimization which allows us to +/// // branch if all the bits of +/// // EXEC are zero. +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch +/// +/// label0: +/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block +/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// S_BRANCH_EXECZ label1 // Use our branch optimization +/// // instruction again. +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block +/// label1: +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" + +using namespace llvm; + +namespace { + +class SILowerControlFlowPass : public MachineFunctionPass { + +private: + static const unsigned SkipThreshold = 12; + + static char ID; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + + bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); + + void Skip(MachineInstr &From, MachineOperand &To); + void SkipIfDead(MachineInstr &MI); + + void If(MachineInstr &MI); + void Else(MachineInstr &MI); + void Break(MachineInstr &MI); + void IfBreak(MachineInstr &MI); + void ElseBreak(MachineInstr &MI); + void Loop(MachineInstr &MI); + void EndCf(MachineInstr &MI); + + void Kill(MachineInstr &MI); + void Branch(MachineInstr &MI); + + void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); + void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); + void IndirectSrc(MachineInstr &MI); + void IndirectDst(MachineInstr &MI); + +public: + SILowerControlFlowPass(TargetMachine &tm) : + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower control flow instructions"; + } + +}; + +} // End anonymous namespace + +char SILowerControlFlowPass::ID = 0; + +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { + return new SILowerControlFlowPass(tm); +} + +bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, + MachineBasicBlock *To) { + + unsigned NumInstr = 0; + + for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); + MBB = *MBB->succ_begin()) { + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + NumInstr < SkipThreshold && I != E; ++I) { + + if (I->isBundle() || !I->isBundled()) + if (++NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { + + if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) + return; + + DebugLoc DL = From.getDebugLoc(); + BuildMI(*From.getParent(), 
&From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addOperand(To) + .addReg(AMDGPU::EXEC); +} + +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + if (MBB.getParent()->getInfo()->getShaderType() != + ShaderType::PIXEL || + !shouldSkip(&MBB, &MBB.getParent()->back())) + return; + + MachineBasicBlock::iterator Insert = &MI; + ++Insert; + + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0); + + // ... and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); +} + +void SILowerControlFlowPass::If(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)) + .addReg(AMDGPU::EXEC); + + MI.eraseFromParent(); +} + +void 
SILowerControlFlowPass::EndCf(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Branch(MachineInstr &MI) { + if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) + MI.eraseFromParent(); + + // If these aren't equal, this is probably an infinite loop. +} + +void SILowerControlFlowPass::Kill(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo(); + // Kill is only allowed in pixel / geometry shaders. + assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif + + // Clear this thread from the exec mask if the operand is negative + if ((Op.isImm())) { + // Constant operand: Set exec mask to 0 or do nothing + if (Op.getImm() & 0x80000000) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + } + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + .addImm(0) + .addOperand(Op); + } + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I = MI; + + unsigned Save = MI.getOperand(1).getReg(); + unsigned Idx = MI.getOperand(3).getReg(); + + if (AMDGPU::SReg_32RegClass.contains(Idx)) { + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx) + .addImm(Offset); + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + } + MBB.insert(I, MovRel); + } else { + + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); + + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); + + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } + // Do the actual move + MBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); + + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + + } + MI.eraseFromParent(); +} + +/// \param @VecReg The register which holds 
element zero of the vector +/// being addressed into. +/// \param[out] @Reg The base register to use in the indirect addressing instruction. +/// \param[in,out] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] +// As an output, this is a constant value that needs +// to be added to the value stored in M0. +void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, + unsigned &Reg, + int &Offset) { + unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); + if (!SubReg) + SubReg = VecReg; + + const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); + int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + + if (RegIdx < 0) { + Offset = RegIdx; + RegIdx = 0; + } else { + Offset = 0; + } + + Reg = RC->getRegister(RegIdx); +} + +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vec = MI.getOperand(2).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Reg; + + computeIndirectRegAndOffset(Vec, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(Reg) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Vec, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Val = MI.getOperand(5).getReg(); + unsigned Reg; + + computeIndirectRegAndOffset(Dst, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) + .addReg(Reg, RegState::Define) + .addReg(Val) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Dst, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + SIMachineFunctionInfo *MFI = MF.getInfo(); + + bool HaveKill = false; + bool NeedWQM = false; + bool NeedFlat = false; + unsigned Depth = 0; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + + MachineInstr &MI = *I; + if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + NeedWQM = true; + + // Flat uses m0 in case it needs to access LDS. 
+ if (TII->isFLAT(MI.getOpcode())) + NeedFlat = true; + + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_IF: + ++Depth; + If(MI); + break; + + case AMDGPU::SI_ELSE: + Else(MI); + break; + + case AMDGPU::SI_BREAK: + Break(MI); + break; + + case AMDGPU::SI_IF_BREAK: + IfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + ElseBreak(MI); + break; + + case AMDGPU::SI_LOOP: + ++Depth; + Loop(MI); + break; + + case AMDGPU::SI_END_CF: + if (--Depth == 0 && HaveKill) { + SkipIfDead(MI); + HaveKill = false; + } + EndCf(MI); + break; + + case AMDGPU::SI_KILL: + if (Depth == 0) + SkipIfDead(MI); + else + HaveKill = true; + Kill(MI); + break; + + case AMDGPU::S_BRANCH: + Branch(MI); + break; + + case AMDGPU::SI_INDIRECT_SRC: + IndirectSrc(MI); + break; + + case AMDGPU::SI_INDIRECT_DST_V1: + case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V16: + IndirectDst(MI); + break; + } + } + } + + if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { + MachineBasicBlock &MBB = MF.front(); + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC).addReg(AMDGPU::EXEC); + } + + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + + return true; +} diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp new file mode 100644 index 00000000000..67421e231d8 --- /dev/null +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -0,0 +1,151 @@ +//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// i1 values are usually inserted by the CFG Structurize pass and they are +/// unique in that they can be copied from VALU to SALU registers. +/// This is not possible for any other value type. Since there are no +/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. 
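/// As a sketch of what the rewrites below produce (register names are
/// illustrative only): a copy from an SGPR-pair i1 into a VReg_1 becomes
///   v_cndmask_b32_e64 vDst, 0, -1, s[0:1]
/// and a copy from a VReg_1 back into an SGPR pair becomes
///   v_cmp_ne_i32_e64 s[0:1], vSrc, 0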
+/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::vector I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + if (RC == &AMDGPU::VReg_1RegClass) + MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY) + continue; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); + + if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(Dst.getReg()); + DebugLoc DL = MI.getDebugLoc(); + + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); + if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { + if (DefInst->getOperand(1).isImm()) { + I1Defs.push_back(Dst.getReg()); + + int64_t Val = DefInst->getOperand(1).getImm(); + assert(Val == 0 || Val == -1); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) + .addOperand(Dst) + .addImm(Val); + MI.eraseFromParent(); + continue; + } + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(Dst) + .addImm(0) + .addImm(-1) + .addOperand(Src); + MI.eraseFromParent(); + } else if 
(TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && + SrcRC == &AMDGPU::VReg_1RegClass) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) + .addOperand(Dst) + .addOperand(Src) + .addImm(0); + MI.eraseFromParent(); + } + } + } + + for (unsigned Reg : I1Defs) + MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); + + return false; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp new file mode 100644 index 00000000000..587ea63d679 --- /dev/null +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -0,0 +1,77 @@ +//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + + +#include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +#define MAX_LANES 64 + +using namespace llvm; + + +// Pin the vtable to this file. +void SIMachineFunctionInfo::anchor() {} + +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF), + TIDReg(AMDGPU::NoRegister), + HasSpilledVGPRs(false), + PSInputAddr(0), + NumUserSGPRs(0), + LDSWaveSpillSize(0) { } + +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( + MachineFunction *MF, + unsigned FrameIndex, + unsigned SubIdx) { + const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIRegisterInfo *TRI = static_cast( + MF->getSubtarget().getRegisterInfo()); + MachineRegisterInfo &MRI = MF->getRegInfo(); + int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); + Offset += SubIdx * 4; + + unsigned LaneVGPRIdx = Offset / (64 * 4); + unsigned Lane = (Offset / 4) % 64; + + struct SpilledReg Spill; + + if (!LaneVGPRs.count(LaneVGPRIdx)) { + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + MRI.setPhysRegUsed(LaneVGPR); + + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. + for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); + BI != BE; ++BI) { + BI->addLiveIn(LaneVGPR); + } + } + + Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; + Spill.Lane = Lane; + return Spill; +} + +unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + // FIXME: We should get this information from kernel attributes if it + // is available. + return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize(); +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h new file mode 100644 index 00000000000..667da4c8af6 --- /dev/null +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -0,0 +1,66 @@ +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "SIRegisterInfo.h" +#include + +namespace llvm { + +class MachineRegisterInfo; + +/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which +/// tells the hardware which interpolation parameters to load. +class SIMachineFunctionInfo : public AMDGPUMachineFunction { + void anchor() override; + + unsigned TIDReg; + bool HasSpilledVGPRs; + +public: + + struct SpilledReg { + unsigned VGPR; + int Lane; + SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } + SpilledReg() : VGPR(0), Lane(-1) { } + bool hasLane() { return Lane != -1;} + }; + + // SIMachineFunctionInfo definition + + SIMachineFunctionInfo(const MachineFunction &MF); + SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, + unsigned SubIdx); + unsigned PSInputAddr; + unsigned NumUserSGPRs; + std::map LaneVGPRs; + unsigned LDSWaveSpillSize; + unsigned ScratchOffsetReg; + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; + unsigned getTIDReg() const { return TIDReg; }; + void setTIDReg(unsigned Reg) { TIDReg = Reg; } + bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } + void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; +}; + +} // End namespace llvm + + +#endif diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp new file mode 100644 index 00000000000..0a7f684552f --- /dev/null +++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp @@ -0,0 +1,194 @@ +//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This pass loads scratch pointer and scratch offset into a register or a +/// frame index which can be used anywhere in the program. These values will +/// be used for spilling VGPRs. 
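/// Roughly, for each SI_SPILL_V* instruction the pass scavenges an SReg_128
/// and materializes the scratch buffer resource descriptor into it, e.g.
/// (register names are illustrative only):
///   s_mov_b32 s8,  SCRATCH_RSRC_DWORD0   // resource base, low word
///   s_mov_b32 s9,  SCRATCH_RSRC_DWORD1   // resource base, high word
///   s_mov_b32 s10, -1                    // size field (0xffffffff)
///   s_mov_b32 s11, Rsrc >> 32            // data format and TID-enable bits
/// before rewriting the spill's srsrc / soffset operands to use it.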
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +namespace { + +class SIPrepareScratchRegs : public MachineFunctionPass { + +private: + static char ID; + +public: + SIPrepareScratchRegs() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI prepare scratch registers"; + } + +}; + +} // End anonymous namespace + +char SIPrepareScratchRegs::ID = 0; + +FunctionPass *llvm::createSIPrepareScratchRegs() { + return new SIPrepareScratchRegs(); +} + +bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock::iterator I = Entry->begin(); + DebugLoc DL = I->getDebugLoc(); + + // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to + // run this pass. + if (!MFI->hasSpilledVGPRs()) + return false; + + unsigned ScratchPtrPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + + if (!Entry->isLiveIn(ScratchPtrPreloadReg)) + Entry->addLiveIn(ScratchPtrPreloadReg); + + if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) + Entry->addLiveIn(ScratchOffsetPreloadReg); + + // Load the scratch offset. + unsigned ScratchOffsetReg = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); + int ScratchOffsetFI = -1; + + if (ScratchOffsetReg != AMDGPU::NoRegister) { + // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) + .addReg(ScratchOffsetPreloadReg); + } else { + // No SGPR is available, we must spill. + ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) + .addReg(ScratchOffsetPreloadReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } + + + // Now that we have the scratch pointer and offset values, we need to + // add them to all the SI_SPILL_V* instructions. + + RegScavenger RS; + unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); + RS.addScavengingFrameIndex(ScratchRsrcFI); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + // Add the scratch offset reg as a live-in so that the register scavenger + // doesn't re-use it. 
+ if (!MBB.isLiveIn(ScratchOffsetReg) && + ScratchOffsetReg != AMDGPU::NoRegister) + MBB.addLiveIn(ScratchOffsetReg); + RS.enterBasicBlock(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + RS.forward(I); + DebugLoc DL = MI.getDebugLoc(); + if (!TII->isVGPRSpill(MI.getOpcode())) + continue; + + // Scratch resource + unsigned ScratchRsrcReg = + RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) + .addImm(Rsrc & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) + .addImm(Rsrc >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + // Scratch Offset + if (ScratchOffsetReg == AMDGPU::NoRegister) { + ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), + ScratchOffsetReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else if (!MBB.isLiveIn(ScratchOffsetReg)) { + MBB.addLiveIn(ScratchOffsetReg); + } + + if (ScratchRsrcReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("ran out of SGPRs for spilling VGPRs"); + ScratchRsrcReg = AMDGPU::SGPR0; + ScratchOffsetReg = AMDGPU::SGPR0; + } + MI.getOperand(2).setReg(ScratchRsrcReg); + MI.getOperand(2).setIsKill(true); + MI.getOperand(2).setIsUndef(false); + MI.getOperand(3).setReg(ScratchOffsetReg); + MI.getOperand(3).setIsUndef(false); + MI.getOperand(3).setIsKill(false); + MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); + } + } + return true; +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp new file mode 100644 index 00000000000..db2ff0b1f95 --- /dev/null +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -0,0 +1,543 @@ +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI implementation of the TargetRegisterInfo class. 
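+///
+/// Besides the usual register-class queries, this implementation handles
+/// frame index elimination for the SI_SPILL_* pseudos: SGPR spills are
+/// written to VGPR lanes with V_WRITELANE/V_READLANE, and VGPR spills are
+/// expanded into scratch buffer loads and stores.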
+// +//===----------------------------------------------------------------------===// + + +#include "SIRegisterInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(AMDGPU::EXEC); + + // EXEC_LO and EXEC_HI could be allocated and used as regular register, + // but this seems likely to result in bugs, so I'm marking them as reserved. + Reserved.set(AMDGPU::EXEC_LO); + Reserved.set(AMDGPU::EXEC_HI); + + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + Reserved.set(AMDGPU::FLAT_SCR); + Reserved.set(AMDGPU::FLAT_SCR_LO); + Reserved.set(AMDGPU::FLAT_SCR_HI); + + // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs + Reserved.set(AMDGPU::VGPR255); + Reserved.set(AMDGPU::VGPR254); + + // Tonga and Iceland can only allocate a fixed number of SGPRs due + // to a hw bug. + if (MF.getSubtarget().hasSGPRInitBug()) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). + // Assume XNACK_MASK is unused. + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + + for (unsigned i = Limit; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); + } + } + + return Reserved; +} + +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + + const AMDGPUSubtarget &STI = MF.getSubtarget(); + // FIXME: We should adjust the max number of waves based on LDS size. 
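+  // The limit for each pressure set is the number of registers that still
+  // allows the maximum wave count, scaled down by the number of 32-bit
+  // sub-registers a member of the class occupies. For example, on SI at
+  // 10 waves per CU this allows 48 SGPRs, so a 64-bit SGPR class gets a
+  // limit of 48 / 2 = 24.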
+ unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), + STI.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + + unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned Limit; + + if (isSGPRClass(*I)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(*I); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; ++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; +} + +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + unsigned Value, + unsigned ScratchRsrcReg, + unsigned ScratchOffset, + int64_t Offset, + RegScavenger *RS) const { + + MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MI->getParent()->getParent(); + const SIInstrInfo *TII = + static_cast(MF->getSubtarget().getInstrInfo()); + LLVMContext &Ctx = MF->getFunction()->getContext(); + DebugLoc DL = MI->getDebugLoc(); + bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + + bool RanOutOfSGPRs = false; + unsigned SOffset = ScratchOffset; + + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned Size = NumSubRegs * 4; + + if (!isUInt<12>(Offset + Size)) { + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + if (SOffset == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + SOffset = AMDGPU::SGPR0; + } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffset) + .addImm(Offset); + Offset = 0; + } + + if (RanOutOfSGPRs) + Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : + Value; + bool IsKill = (i == e - 1); + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + } +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = + static_cast(MF->getSubtarget().getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + + switch (MI->getOpcode()) { + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg) + .addImm(Spill.Lane); + + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } + + // TODO: only do this when it is needed + switch (MF->getSubtarget().getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI + TII->insertNOPs(MI, 3); + break; + case AMDGPUSubtarget::SEA_ISLANDS: + break; + default: // VOLCANIC_ISLANDS and later + // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI + // and later. This also applies to VALUs which write VCC, but we're + // unlikely to see VMEM use VCC. 
+ TII->insertNOPs(MI, 4); + } + + MI->eraseFromParent(); + break; + } + + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V96_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + } + + default: { + int64_t Offset = FrameInfo->getObjectOffset(Index); + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false, false, true); + } + } + } +} + +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::VGPR_32RegClass; + } +} + +unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { + return getEncodingValue(Reg) & 0xff; +} + +const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + static const TargetRegisterClass *BaseClasses[] = { + &AMDGPU::VGPR_32RegClass, + &AMDGPU::SReg_32RegClass, + &AMDGPU::VReg_64RegClass, + &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, + &AMDGPU::SReg_128RegClass, + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass + }; + + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; + } + } + return nullptr; +} + +bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const { + if (hasVGPRs(SRC)) { + return SRC; + } else if (SRC == &AMDGPU::SCCRegRegClass) { + return &AMDGPU::VCCRegRegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { + return &AMDGPU::VGPR_32RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { + return &AMDGPU::VReg_64RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { + return &AMDGPU::VReg_128RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { + return 
&AMDGPU::VReg_256RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { + return &AMDGPU::VReg_512RegClass; + } + return nullptr; +} + +const TargetRegisterClass *SIRegisterInfo::getSubRegClass( + const TargetRegisterClass *RC, unsigned SubIdx) const { + if (SubIdx == AMDGPU::NoSubRegister) + return RC; + + // If this register has a sub-register, we can safely assume it is a 32-bit + // register, because all of SI's sub-registers are 32-bit. + if (isSGPRClass(RC)) { + return &AMDGPU::SGPR_32RegClass; + } else { + return &AMDGPU::VGPR_32RegClass; + } +} + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + + switch (Reg) { + case AMDGPU::VCC: + switch(Channel) { + case 0: return AMDGPU::VCC_LO; + case 1: return AMDGPU::VCC_HI; + default: llvm_unreachable("Invalid SubIdx for VCC"); + } + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; + } + + const TargetRegisterClass *RC = getPhysRegClass(Reg); + // 32-bit registers don't have sub-registers, so we can just return the + // Reg. We need to have this check here, because the calculation below + // using getHWRegIndex() will fail with special 32-bit registers like + // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. + if (RC->getSize() == 4) { + assert(Channel == 0); + return Reg; + } + + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} + +bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { + return OpType == AMDGPU::OPERAND_REG_IMM32; +} + +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (opCanUseLiteralConstant(OpType)) + return true; + + return OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + switch (Value) { + case SIRegisterInfo::TGID_X: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + case SIRegisterInfo::TGID_Y: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + case SIRegisterInfo::TGID_Z: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + if (MFI->getShaderType() != ShaderType::COMPUTE) + return MFI->ScratchOffsetReg; + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + case SIRegisterInfo::SCRATCH_PTR: + return AMDGPU::SGPR2_SGPR3; + case SIRegisterInfo::INPUT_PTR: + return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::TIDIG_X: + return AMDGPU::VGPR0; + case SIRegisterInfo::TIDIG_Y: + return AMDGPU::VGPR1; + case SIRegisterInfo::TIDIG_Z: + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected preloaded value type"); +} + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +// AMDGPU::NoRegister. 
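+/// SIPrepareScratchRegs, for example, uses this to pick an SGPR for the
+/// scratch wave offset; callers must be prepared to handle the
+/// AMDGPU::NoRegister result and fall back to spilling.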
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const { + + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) { + if (!MRI.isPhysRegUsed(*I)) + return *I; + } + return AMDGPU::NoRegister; +} + +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const { + if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WaveCount) { + case 10: return 80; + case 9: return 80; + case 8: return 96; + default: return 102; + } + } else { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } + } +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h new file mode 100644 index 00000000000..bfdb67c5e12 --- /dev/null +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -0,0 +1,131 @@ +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIRegisterInfo +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Support/Debug.h" + +namespace llvm { + +struct SIRegisterInfo : public AMDGPURegisterInfo { + + SIRegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; + + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief Return the 'base' register class for this register. + /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. + const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; + + /// \returns true if this class contains only SGPR registers + bool isSGPRClass(const TargetRegisterClass *RC) const { + if (!RC) + return false; + + return !hasVGPRs(RC); + } + + /// \returns true if this class ID contains only SGPR registers + bool isSGPRClassID(unsigned RCID) const { + if (static_cast(RCID) == -1) + return false; + + return isSGPRClass(getRegClass(RCID)); + } + + /// \returns true if this class contains VGPR registers. 
+ bool hasVGPRs(const TargetRegisterClass *RC) const; + + /// \returns A VGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const; + + /// \returns The register class that is used for a sub-register of \p RC for + /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will + /// be returned. + const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, + unsigned SubIdx) const; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. + unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; + + /// \returns True if operands defined with this operand type can accept + /// a literal constant (i.e. any 32-bit immediate). + bool opCanUseLiteralConstant(unsigned OpType) const; + + /// \returns True if operands defined with this operand type can accept + /// an inline constant. i.e. An integer value in the range (-16, 64) or + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + bool opCanUseInlineConstant(unsigned OpType) const; + + enum PreloadedValue { + TGID_X, + TGID_Y, + TGID_Z, + SCRATCH_WAVE_OFFSET, + SCRATCH_PTR, + INPUT_PTR, + TIDIG_X, + TIDIG_Y, + TIDIG_Z + }; + + /// \brief Returns the physical register that \p Value is stored in. + unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + + /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const; + + unsigned findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const; + +private: + void buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, unsigned Value, + unsigned ScratchRsrcReg, unsigned ScratchOffset, + int64_t Offset, RegScavenger *RS) const; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td new file mode 100644 index 00000000000..2a9017fa2a9 --- /dev/null +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -0,0 +1,284 @@ +//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the SI registers +//===----------------------------------------------------------------------===// + +class SIReg encoding = 0> : Register { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +// Special Registers +def VCC_LO : SIReg<"vcc_lo", 106>; +def VCC_HI : SIReg<"vcc_hi", 107>; + +// VCC for 64-bit instructions +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 106; +} + +def EXEC_LO : SIReg<"exec_lo", 126>; +def EXEC_HI : SIReg<"exec_hi", 127>; + +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 126; +} + +def SCC : SIReg<"scc", 253>; +def M0 : SIReg <"m0", 124>; + +def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. +def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. + +// Pair to indicate location of scratch space for flat accesses. +def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + +// SGPR registers +foreach Index = 0-101 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index>; +} + +// VGPR registers +foreach Index = 0-255 in { + def VGPR#Index : SIReg <"VGPR"#Index, Index> { + let HWEncoding{8} = 1; + } +} + +//===----------------------------------------------------------------------===// +// Groupings using register classes and tuples +//===----------------------------------------------------------------------===// + +// SGPR 32-bit registers +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "SGPR%u", 0, 101))>; + +// SGPR 64-bit registers +def SGPR_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate (trunc SGPR_32, 101), 2)), + (add (decimate (shl SGPR_32, 1), 2))]>; + +// SGPR 128-bit registers +def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate (trunc SGPR_32, 99), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4))]>; + +// SGPR 256-bit registers +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (decimate (trunc SGPR_32, 95), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4))]>; + +// SGPR 512-bit registers +def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (decimate (trunc SGPR_32, 87), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 13), 4)), + (add 
(decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4))]>; + +// VGPR 32-bit registers +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "VGPR%u", 0, 255))>; + +// VGPR 64-bit registers +def VGPR_64 : RegisterTuples<[sub0, sub1], + [(add (trunc VGPR_32, 255)), + (add (shl VGPR_32, 1))]>; + +// VGPR 96-bit registers +def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], + [(add (trunc VGPR_32, 254)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2))]>; + +// VGPR 128-bit registers +def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (trunc VGPR_32, 253)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3))]>; + +// VGPR 256-bit registers +def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (trunc VGPR_32, 249)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7))]>; + +// VGPR 512-bit registers +def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (trunc VGPR_32, 241)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + (add (shl VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15))]>; + +//===----------------------------------------------------------------------===// +// Register classes used as source and destination +//===----------------------------------------------------------------------===// + +class RegImmMatcher : AsmOperandClass { + let Name = name; + let RenderMethod = "addRegOrImmOperands"; +} + +// Special register classes for predicates and the M0 register +def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { + let CopyCost = -1; // Theoretically it is possible to read from SCC, + // but it should never be necessary. 
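+  // A negative copy cost marks copies of this class as prohibitively
+  // expensive, which keeps the allocator from trying to materialize SCC
+  // in another register.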
+} + +def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; +def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; + +// Register class for all scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) +>; + +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; + +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, + (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) +>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; + +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; + +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; + +// Register class for all vector registers (VGPRs + Interploation Registers) +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; + +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { + let Size = 96; +} + +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; + +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; + +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 32; +} + +class RegImmOperand : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; +} + +class RegInlineOperand : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; +} + +//===----------------------------------------------------------------------===// +// SSrc_* Operands with an SGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def SSrc_32 : RegImmOperand { + let ParserMatchClass = RegImmMatcher<"SSrc32">; +} + +def SSrc_64 : RegImmOperand { + let ParserMatchClass = RegImmMatcher<"SSrc64">; +} + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or a inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_32 : RegInlineOperand { + let ParserMatchClass = RegImmMatcher<"SCSrc32">; +} + +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; + +def VSrc_32 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc32">; +} + +def VSrc_64 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc64">; +} + +//===----------------------------------------------------------------------===// +// VCSrc_* Operands with an SGPR, VGPR or an inline constant +//===----------------------------------------------------------------------===// + +def VCSrc_32 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc32">; +} + +def VCSrc_64 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = 
"OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc64">; +} diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td new file mode 100644 index 00000000000..9b1f676020b --- /dev/null +++ b/lib/Target/AMDGPU/SISchedule.td @@ -0,0 +1,91 @@ +//===-- SISchedule.td - SI Scheduling definitons -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MachineModel definitions for Southern Islands (SI) +// +//===----------------------------------------------------------------------===// + +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; + +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteQuarterRate32 : SchedWrite; + +def WriteFloatFMA : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; +def SIQuarterSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? +def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +class HWWriteRes resources, + int latency> : WriteRes { + let Latency = latency; +} + +class HWVALUWriteRes : + HWWriteRes; + + +// The latency numbers are taken from AMD Accelerated Parallel Processing +// guide. They may not be acurate. + +// The latency values are 1 / (operations / cycle) / 4. +multiclass SICommonWriteRes { + + def : HWWriteRes; // XXX: Guessed ??? + def : HWWriteRes; // XXX: Guessed ??? + def : HWWriteRes; // 2 - 64 + def : HWWriteRes; + def : HWWriteRes; // XXX: Guessed ??? + def : HWWriteRes; // 300 - 600 + + def : HWVALUWriteRes; + def : HWVALUWriteRes; +} + + +let SchedModel = SIFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +} // End SchedModel = SIFullSpeedModel + +let SchedModel = SIQuarterSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +} // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp new file mode 100644 index 00000000000..51e72cdb5f9 --- /dev/null +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -0,0 +1,272 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. 
+//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUMCInstLower.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-shrink-instructions" + +STATISTIC(NumInstructionsShrunk, + "Number of 64-bit instruction reduced to 32-bit."); +STATISTIC(NumLiteralConstantsFolded, + "Number of literal constants folded into 32-bit instructions."); + +namespace llvm { + void initializeSIShrinkInstructionsPass(PassRegistry&); +} + +using namespace llvm; + +namespace { + +class SIShrinkInstructions : public MachineFunctionPass { +public: + static char ID; + +public: + SIShrinkInstructions() : MachineFunctionPass(ID) { + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Shrink Instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) +INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) + +char SIShrinkInstructions::ID = 0; + +FunctionPass *llvm::createSIShrinkInstructionsPass() { + return new SIShrinkInstructions(); +} + +static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + if (!MO->isReg()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) + return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); + + return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); +} + +static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + + const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + // Can't shrink instruction with three operands. + // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add + // a special case for it. It can only be shrunk if the third operand + // is vcc. We should handle this the same way we handle vopc, by addding + // a register allocation hint pre-regalloc and then do the shrining + // post-regalloc. + if (Src2) + return false; + + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) + return false; + + // We don't need to check src0, all input types are legal, so just make sure + // src0 isn't using any modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) + return false; + + // Check output modifiers + if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + return true; +} + +/// \brief This function checks \p MI for operands defined by a move immediate +/// instruction and then folds the literal constant into the instruction if it +/// can. 
This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction +/// and will only fold literal constants if we are still in SSA. +static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, + MachineRegisterInfo &MRI, bool TryToCommute = true) { + + if (!MRI.isSSA()) + return; + + assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || + TII->isVOPC(MI.getOpcode())); + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + + // Only one literal constant is allowed per instruction, so if src0 is a + // literal constant then we can't do any folding. + if (Src0.isImm() && + TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) + return; + + // Literal constants and SGPRs can only be used in Src0, so if Src0 is an + // SGPR, we cannot commute the instruction, so we can't fold any literal + // constants. + if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) + return; + + // Try to fold Src0 + if (Src0.isReg()) { + unsigned Reg = Src0.getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } + if (ConstantFolded) { + if (MRI.use_empty(Reg)) + Def->eraseFromParent(); + ++NumLiteralConstantsFolded; + return; + } + } + } + + // We have failed to fold src0, so commute the instruction and try again. + if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + foldImmediates(MI, TII, MRI, false); + +} + +bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + std::vector I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineOperand &Src = MI.getOperand(1); + + if (Src.isImm()) { + if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + } + + continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + if (!canShrink(MI, TII, TRI, MRI)) { + // Try commuting the instruction and see if that enables us to shrink + // it. + if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + !canShrink(MI, TII, TRI, MRI)) + continue; + } + + // getVOPe32 could be -1 here if we started with an instruction that had + // a 32-bit encoding and then commuted it to an instruction that did not. + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); + + if (TII->isVOPC(Op32)) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + // VOPC instructions can only write to the VCC register. 
We can't + // force them to use VCC here, because the register allocator has + // trouble with sequences like this, which cause the allocator to run + // out of registers if vreg0 and vreg1 belong to the VCCReg register + // class: + // vreg0 = VOPC; + // vreg1 = VOPC; + // S_AND_B64 vreg0, vreg1 + // + // So, instead of forcing the instruction to write to VCC, we provide + // a hint to the register allocator to use VCC and then we we will run + // this pass again after RA and shrink it if it outputs to VCC. + MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + continue; + } + if (DstReg != AMDGPU::VCC) + continue; + } + + // We can shrink this instruction + DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); + + MachineInstrBuilder Inst32 = + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); + + // dst + Inst32.addOperand(MI.getOperand(0)); + + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = + TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + Inst32.addOperand(*Src1); + + ++NumInstructionsShrunk; + MI.eraseFromParent(); + + foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); + + + } + } + return false; +} diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp new file mode 100644 index 00000000000..591ce857cc7 --- /dev/null +++ b/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -0,0 +1,161 @@ +//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass removes performs the following type substitution on all +/// non-compute shaders: +/// +/// v16i8 => i128 +/// - v16i8 is used for constant memory resource descriptors. This type is +/// legal for some compute APIs, and we don't want to declare it as legal +/// in the backend, because we want the legalizer to expand all v16i8 +/// operations. +/// v1* => * +/// - Having v1* types complicates the legalizer and we can easily replace +/// - them with the element type. 
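+///
+/// As an illustration of the first rewrite, a load of a v16i8 resource
+/// descriptor is replaced by a v4i32 load through a bitcast of the pointer,
+/// followed by a bitcast of the loaded value back to the type the original
+/// users expect; intrinsic calls taking v16i8 arguments are redirected to a
+/// ".v4i32" variant of the callee.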
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { + +class SITypeRewriter : public FunctionPass, + public InstVisitor { + + static char ID; + Module *Mod; + Type *v16i8; + Type *v4i32; + +public: + SITypeRewriter() : FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { + return "SI Type Rewriter"; + } + void visitLoadInst(LoadInst &I); + void visitCallInst(CallInst &I); + void visitBitCast(BitCastInst &I); +}; + +} // End anonymous namespace + +char SITypeRewriter::ID = 0; + +bool SITypeRewriter::doInitialization(Module &M) { + Mod = &M; + v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); + return false; +} + +bool SITypeRewriter::runOnFunction(Function &F) { + Attribute A = F.getFnAttribute("ShaderType"); + + unsigned ShaderType = ShaderType::COMPUTE; + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + Str.getAsInteger(0, ShaderType); + } + if (ShaderType == ShaderType::COMPUTE) + return false; + + visit(F); + visit(F); + + return false; +} + +void SITypeRewriter::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + Type *PtrTy = Ptr->getType(); + Type *ElemTy = PtrTy->getPointerElementType(); + IRBuilder<> Builder(&I); + if (ElemTy == v16i8) { + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); + LoadInst *Load = Builder.CreateLoad(BitCast); + SmallVector, 8> MD; + I.getAllMetadataOtherThanDebugLoc(MD); + for (unsigned i = 0, e = MD.size(); i != e; ++i) { + Load->setMetadata(MD[i].first, MD[i].second); + } + Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); + I.replaceAllUsesWith(BitCastLoad); + I.eraseFromParent(); + } +} + +void SITypeRewriter::visitCallInst(CallInst &I) { + IRBuilder<> Builder(&I); + + SmallVector Args; + SmallVector Types; + bool NeedToReplace = false; + Function *F = I.getCalledFunction(); + std::string Name = F->getName(); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + Value *Arg = I.getArgOperand(i); + if (Arg->getType() == v16i8) { + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); + NeedToReplace = true; + Name = Name + ".v4i32"; + } else if (Arg->getType()->isVectorTy() && + Arg->getType()->getVectorNumElements() == 1 && + Arg->getType()->getVectorElementType() == + Type::getInt32Ty(I.getContext())){ + Type *ElementTy = Arg->getType()->getVectorElementType(); + std::string TypeName = "i32"; + InsertElementInst *Def = cast(Arg); + Args.push_back(Def->getOperand(1)); + Types.push_back(ElementTy); + std::string VecTypeName = "v1" + TypeName; + Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); + NeedToReplace = true; + } else { + Args.push_back(Arg); + Types.push_back(Arg->getType()); + } + } + + if (!NeedToReplace) { + return; + } + Function *NewF = Mod->getFunction(Name); + if (!NewF) { + NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); + NewF->setAttributes(F->getAttributes()); + } + I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); + I.eraseFromParent(); +} + +void SITypeRewriter::visitBitCast(BitCastInst &I) { + IRBuilder<> Builder(&I); + if (I.getDestTy() != v4i32) { + return; + 
} + + if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { + if (Op->getSrcTy() == v4i32) { + I.replaceAllUsesWith(Op->getOperand(0)); + I.eraseFromParent(); + } + } +} + +FunctionPass *llvm::createSITypeRewriter() { + return new SITypeRewriter(); +} diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp new file mode 100644 index 00000000000..2112135aa5d --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -0,0 +1,30 @@ +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +/// \brief The target which suports all AMD GPUs. This will eventually +/// be deprecated and there will be a R600 target and a GCN target. +Target llvm::TheAMDGPUTarget; +/// \brief The target for GCN GPUs +Target llvm::TheGCNTarget; + +/// \brief Extern function to initialize the targets for the AMDGPU backend +extern "C" void LLVMInitializeAMDGPUTargetInfo() { + RegisterTarget + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); +} diff --git a/lib/Target/AMDGPU/TargetInfo/CMakeLists.txt b/lib/Target/AMDGPU/TargetInfo/CMakeLists.txt new file mode 100644 index 00000000000..961dc550900 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAMDGPUInfo + AMDGPUTargetInfo.cpp + ) diff --git a/lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt b/lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt new file mode 100644 index 00000000000..291317fa072 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUInfo +parent = AMDGPU +required_libraries = Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/TargetInfo/Makefile b/lib/Target/AMDGPU/TargetInfo/Makefile new file mode 100644 index 00000000000..b8ac4e78230 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td new file mode 100644 index 00000000000..d8738f99263 --- /dev/null +++ b/lib/Target/AMDGPU/VIInstrFormats.td @@ -0,0 +1,166 @@ +//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// VI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class DSe_vi op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{16} = gds; + let Inst{24-17} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe_vi op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{16} = lds; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe_vi op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{18-15} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class SMEMe_vi op, bit imm> : Enc64 { + bits<7> sbase; + bits<7> sdata; + bits<1> glc; + bits<20> offset; + + let Inst{5-0} = sbase{6-1}; + let Inst{12-6} = sdata; + let Inst{16} = glc; + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = offset; +} + +class VOP3e_vi op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be_vi op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} 
= 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class EXPe_vi : EXPe { + let Inst{31-26} = 0x31; //encoding +} + +class VINTRPe_vi op> : VINTRPe { + let Inst{31-26} = 0x35; // encoding +} diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td new file mode 100644 index 00000000000..5bf86e649ce --- /dev/null +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -0,0 +1,106 @@ +//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for VI and newer. +//===----------------------------------------------------------------------===// + +let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +defm V_CVT_F16_U16 : VOP1Inst , "v_cvt_f16_u16", VOP_F16_I16>; +defm V_CVT_F16_I16 : VOP1Inst , "v_cvt_f16_i16", VOP_F16_I16>; +defm V_CVT_U16_F16 : VOP1Inst , "v_cvt_u16_f16", VOP_I16_F16>; +defm V_CVT_I16_F16 : VOP1Inst , "v_cvt_i16_f16", VOP_I16_F16>; +defm V_RCP_F16 : VOP1Inst , "v_rcp_f16", VOP_F16_F16>; +defm V_SQRT_F16 : VOP1Inst , "v_sqrt_f16", VOP_F16_F16>; +defm V_RSQ_F16 : VOP1Inst , "v_rsq_f16", VOP_F16_F16>; +defm V_LOG_F16 : VOP1Inst , "v_log_f16", VOP_F16_F16>; +defm V_EXP_F16 : VOP1Inst , "v_exp_f16", VOP_F16_F16>; +defm V_FREXP_MANT_F16 : VOP1Inst , "v_frexp_mant_f16", + VOP_F16_F16 +>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst , "v_frexp_exp_i16_f16", + VOP_I16_F16 +>; +defm V_FLOOR_F16 : VOP1Inst , "v_floor_f16", VOP_F16_F16>; +defm V_CEIL_F16 : VOP1Inst , "v_ceil_f16", VOP_F16_F16>; +defm V_TRUNC_F16 : VOP1Inst , "v_trunc_f16", VOP_F16_F16>; +defm V_RNDNE_F16 : VOP1Inst , "v_rndne_f16", VOP_F16_F16>; +defm V_FRACT_F16 : VOP1Inst , "v_fract_f16", VOP_F16_F16>; +defm V_SIN_F16 : VOP1Inst , "v_sin_f16", VOP_F16_F16>; +defm V_COS_F16 : VOP1Inst , "v_cos_f16", VOP_F16_F16>; + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { + +defm V_ADD_F16 : VOP2Inst , "v_add_f16", VOP_F16_F16_F16>; +defm V_SUB_F16 : VOP2Inst , "v_sub_f16", VOP_F16_F16_F16>; +defm V_SUBREV_F16 : VOP2Inst , "v_subrev_f16", VOP_F16_F16_F16, + null_frag, "v_sub_f16" +>; +defm V_MUL_F16 : VOP2Inst , "v_mul_f16", VOP_F16_F16_F16>; +defm V_MAC_F16 : VOP2Inst , "v_mac_f16", VOP_F16_F16_F16>; +} // End isCommutable = 1 +defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16">; +let isCommutable = 1 in { +defm V_MADAK_F16 : VOP2MADK , "v_madak_f16">; +defm V_ADD_U16 : VOP2Inst , "v_add_u16", VOP_I16_I16_I16>; +defm V_SUB_U16 : VOP2Inst , "v_sub_u16" , VOP_I16_I16_I16>; +defm V_SUBREV_U16 : VOP2Inst , "v_subrev_u16", VOP_I16_I16_I16>; +defm V_MUL_LO_U16 : VOP2Inst , "v_mul_lo_u16", VOP_I16_I16_I16>; +} // End isCommutable = 1 +defm V_LSHLREV_B16 : VOP2Inst , "v_lshlrev_b16", VOP_I16_I16_I16>; +defm V_LSHRREV_B16 : VOP2Inst , "v_lshrrev_b16", VOP_I16_I16_I16>; +defm V_ASHRREV_B16 : 
VOP2Inst , "v_ashrrev_b16", VOP_I16_I16_I16>; +let isCommutable = 1 in { +defm V_MAX_F16 : VOP2Inst , "v_max_f16", VOP_F16_F16_F16>; +defm V_MIN_F16 : VOP2Inst , "v_min_f16", VOP_F16_F16_F16>; +defm V_MAX_U16 : VOP2Inst , "v_max_u16", VOP_I16_I16_I16>; +defm V_MAX_I16 : VOP2Inst , "v_max_i16", VOP_I16_I16_I16>; +defm V_MIN_U16 : VOP2Inst , "v_min_u16", VOP_I16_I16_I16>; +defm V_MIN_I16 : VOP2Inst , "v_min_i16", VOP_I16_I16_I16>; +} // End isCommutable = 1 +defm V_LDEXP_F16 : VOP2Inst , "v_ldexp_f16", VOP_F16_F16_I16>; + +// Aliases to simplify matching of floating-pint instructions that are VOP2 on +// SI and VOP3 on VI. + +class SI2_VI3Alias : InstAlias < + name#" $dst, $src0, $src1", + (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0) +>, PredicateControl { + let UseInstAsmMatchConverter = 0; +} + +def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; + +} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// SMEM Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isVI] in { + +// 1. Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +} // End Predicates = [isVI] diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 3af3426b94c..ab823248928 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -19,6 +19,7 @@ ; will typically require only insertion of a line. [common] subdirectories = + AMDGPU ARM AArch64 BPF @@ -28,7 +29,6 @@ subdirectories = NVPTX Mips PowerPC - R600 Sparc SystemZ X86 diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h deleted file mode 100644 index 0a05d25189b..00000000000 --- a/lib/Target/R600/AMDGPU.h +++ /dev/null @@ -1,148 +0,0 @@ -//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H -#define LLVM_LIB_TARGET_R600_AMDGPU_H - -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - -class AMDGPUInstrPrinter; -class AMDGPUSubtarget; -class AMDGPUTargetMachine; -class FunctionPass; -class MCAsmInfo; -class raw_ostream; -class Target; -class TargetMachine; - -// R600 Passes -FunctionPass *createR600VectorRegMerger(TargetMachine &tm); -FunctionPass *createR600TextureIntrinsicsReplacer(); -FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); -FunctionPass *createR600EmitClauseMarkers(); -FunctionPass *createR600ClauseMergePass(TargetMachine &tm); -FunctionPass *createR600Packetizer(TargetMachine &tm); -FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); -FunctionPass *createAMDGPUCFGStructurizerPass(); - -// SI Passes -FunctionPass *createSITypeRewriter(); -FunctionPass *createSIAnnotateControlFlowPass(); -FunctionPass *createSIFoldOperandsPass(); -FunctionPass *createSILowerI1CopiesPass(); -FunctionPass *createSIShrinkInstructionsPass(); -FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); -FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); -FunctionPass *createSIFixControlFlowLiveIntervalsPass(); -FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); -FunctionPass *createSIFixSGPRLiveRangesPass(); -FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); - -void initializeSIFoldOperandsPass(PassRegistry &); -extern char &SIFoldOperandsID; - -void initializeSILowerI1CopiesPass(PassRegistry &); -extern char &SILowerI1CopiesID; - -void initializeSILoadStoreOptimizerPass(PassRegistry &); -extern char &SILoadStoreOptimizerID; - -// Passes common to R600 and SI -FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); -Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &tm); -ModulePass *createAMDGPUAlwaysInlinePass(); - -void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); -extern char &SIFixControlFlowLiveIntervalsID; - -void initializeSIFixSGPRLiveRangesPass(PassRegistry&); -extern char &SIFixSGPRLiveRangesID; - - -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; - -namespace AMDGPU { -enum TargetIndex { - TI_CONSTDATA_START, - TI_SCRATCH_RSRC_DWORD0, - TI_SCRATCH_RSRC_DWORD1, - TI_SCRATCH_RSRC_DWORD2, - TI_SCRATCH_RSRC_DWORD3 -}; -} - -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - -} // End namespace llvm - -namespace ShaderType { - enum Type { - PIXEL = 0, - VERTEX = 1, - GEOMETRY = 2, - COMPUTE = 3 - }; -} - -/// OpenCL uses address spaces to differentiate between -/// various memory regions on the hardware. On the CPU -/// all of the address spaces point to the same memory, -/// however on the GPU, each address space points to -/// a separate piece of memory that is unique from other -/// memory locations. -namespace AMDGPUAS { -enum AddressSpaces : unsigned { - PRIVATE_ADDRESS = 0, ///< Address space for private memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory - LOCAL_ADDRESS = 3, ///< Address space for local memory. - FLAT_ADDRESS = 4, ///< Address space for flat memory. - REGION_ADDRESS = 5, ///< Address space for region memory. 
- PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) - - // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this - // order to be able to dynamically index a constant buffer, for example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, - ADDRESS_NONE = 24, ///< Address space for unknown memory. - LAST_ADDRESS = ADDRESS_NONE, - - // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u -}; - -} // namespace AMDGPUAS - -#endif diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td deleted file mode 100644 index 2e7e39a54d3..00000000000 --- a/lib/Target/R600/AMDGPU.td +++ /dev/null @@ -1,266 +0,0 @@ -//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// Subtarget Features -//===----------------------------------------------------------------------===// - -// Debugging Features - -def FeatureDumpCode : SubtargetFeature <"DumpCode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", - "EnableIRStructurizer", - "false", - "Disable IR Structurizer">; - -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass">; - -// Target features - -def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", - "EnableIfCvt", - "false", - "Disable the if conversion pass">; - -def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations">; - -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64Denormals", - "true", - "Enable double precision denormal handling", - [FeatureFP64]>; - -def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", - "FastFMAF32", - "true", - "Assuming f32 fma is at least as fast as mul + add", - []>; - -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. 
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling">; - -def Feature64BitPtr : SubtargetFeature<"64BitPtr", - "Is64bit", - "true", - "Specify if 64-bit addressing should be used">; - -def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding">; - -def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", - "true", - "Specify use of dedicated vertex cache">; - -def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA">; - -def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", - "true", - "GPU has CF_ALU bug">; - -// XXX - This should probably be removed once enabled by default -def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", - "EnableLoadStoreOpt", - "true", - "Enable SI load/store optimizer pass">; - -def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", - "FlatAddressSpace", - "true", - "Support flat address space">; - -def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", - "EnableVGPRSpilling", - "true", - "Enable spilling of VGPRs to scratch memory">; - -def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", - "SGPRInitBug", - "true", - "VI SGPR initilization bug requiring a fixed SGPR allocation size">; - -class SubtargetFeatureFetchLimit : - SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value>; - -def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; -def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; - -class SubtargetFeatureWavefrontSize : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast(Value), - "The number of threads per wavefront">; - -def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; -def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; -def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; - -class SubtargetFeatureLDSBankCount : SubtargetFeature < - "ldsbankcount"#Value, - "LDSBankCount", - !cast(Value), - "The number of LDS banks per compute unit.">; - -def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; -def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; - -class SubtargetFeatureLocalMemorySize : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast(Value), - "The size of local memory in bytes">; - -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU">; - -def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", - "GCN1Encoding", - "true", - "Encoding format for SI and CI">; - -def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", - "GCN3Encoding", - "true", - "Encoding format for VI">; - -def FeatureCIInsts : SubtargetFeature<"ci-insts", - "CIInsts", - "true", - "Additional intstructions for CI+">; - -// Dummy feature used to disable assembler instructions. 
-def FeatureDisable : SubtargetFeature<"", - "FeatureDisable","true", - "Dummy feature to disable assembler" - " instructions">; - -class SubtargetFeatureGeneration Implies> : - SubtargetFeature ; - -def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; -def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; -def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; - -def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; - -def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0]>; - -def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; - -def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] ->; - -def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, - FeatureLDSBankCount32]>; - -def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts]>; - -def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; - -//===----------------------------------------------------------------------===// - -def AMDGPUInstrInfo : InstrInfo { - let guessInstructionProperties = 1; - let noNamedPositionallyEncodedOperands = 1; -} - -def AMDGPUAsmParser : AsmParser { - // Some of the R600 registers have the same name, so this crashes. - // For example T0_XYZW and T0_XY both have the asm name T0. 
- let ShouldEmitMatchRegisterName = 0; -} - -def AMDGPU : Target { - // Pull in Instruction Info: - let InstructionSet = AMDGPUInstrInfo; - let AssemblyParsers = [AMDGPUAsmParser]; -} - -// Dummy Instruction itineraries for pseudo instructions -def ALU_NULL : FuncUnit; -def NullALU : InstrItinClass; - -//===----------------------------------------------------------------------===// -// Predicate helper class -//===----------------------------------------------------------------------===// - -def TruePredicate : Predicate<"true">; -def isSICI : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"FeatureGCN1Encoding">; - -class PredicateControl { - Predicate SubtargetPredicate; - Predicate SIAssemblerPredicate = isSICI; - list AssemblerPredicates = []; - Predicate AssemblerPredicate = TruePredicate; - list OtherPredicates = []; - list Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], - AssemblerPredicates, - OtherPredicates); -} - -// Include AMDGPU TD files -include "R600Schedule.td" -include "SISchedule.td" -include "Processors.td" -include "AMDGPUInstrInfo.td" -include "AMDGPUIntrinsics.td" -include "AMDGPURegisterInfo.td" -include "AMDGPUInstructions.td" -include "AMDGPUCallingConv.td" diff --git a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp deleted file mode 100644 index 0b426bc63dd..00000000000 --- a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp +++ /dev/null @@ -1,67 +0,0 @@ -//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass marks all internal functions as always_inline and creates -/// duplicates of all other functions a marks the duplicates as always_inline. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Cloning.h" - -using namespace llvm; - -namespace { - -class AMDGPUAlwaysInline : public ModulePass { - - static char ID; - -public: - AMDGPUAlwaysInline() : ModulePass(ID) { } - bool runOnModule(Module &M) override; - const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } -}; - -} // End anonymous namespace - -char AMDGPUAlwaysInline::ID = 0; - -bool AMDGPUAlwaysInline::runOnModule(Module &M) { - - std::vector FuncsToClone; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(Attribute::NoInline)) - FuncsToClone.push_back(&F); - } - - for (Function *F : FuncsToClone) { - ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap, false); - NewFunc->setLinkage(GlobalValue::InternalLinkage); - F->getParent()->getFunctionList().push_back(NewFunc); - F->replaceAllUsesWith(NewFunc); - } - - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { - F.addFnAttr(Attribute::AlwaysInline); - } - } - return false; -} - -ModulePass *llvm::createAMDGPUAlwaysInlinePass() { - return new AMDGPUAlwaysInline(); -} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp deleted file mode 100644 index 56b50a9c159..00000000000 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ /dev/null @@ -1,600 +0,0 @@ -//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// The AMDGPUAsmPrinter is used to print both assembly string and also binary -/// code. When passed an MCAsmStreamer it prints assembly and when passed -/// an MCObjectStreamer it outputs binary code. -// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPUAsmPrinter.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "AMDGPU.h" -#include "AMDKernelCodeT.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "SIDefines.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -using namespace llvm; - -// TODO: This should get the default rounding mode from the kernel. We just set -// the default here, but this could change if the OpenCL rounding mode pragmas -// are used. -// -// The denormal mode here should match what is reported by the OpenCL runtime -// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but -// can also be override to flush with the -cl-denorms-are-zero compiler flag. 
-// -// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double -// precision, and leaves single precision to flush all and does not report -// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports -// CL_FP_DENORM for both. -// -// FIXME: It seems some instructions do not support single precision denormals -// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, -// and sin_f32, cos_f32 on most parts). - -// We want to use these instructions, and using fp32 denormals also causes -// instructions to run at the double precision rate for the device so it's -// probably best to just report no single precision denormals. -static uint32_t getFPMode(const MachineFunction &F) { - const AMDGPUSubtarget& ST = F.getSubtarget(); - // TODO: Is there any real use for the flush in only / flush out only modes? - - uint32_t FP32Denormals = - ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - uint32_t FP64Denormals = - ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | - FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | - FP_DENORM_MODE_SP(FP32Denormals) | - FP_DENORM_MODE_DP(FP64Denormals); -} - -static AsmPrinter * -createAMDGPUAsmPrinterPass(TargetMachine &tm, - std::unique_ptr &&Streamer) { - return new AMDGPUAsmPrinter(tm, std::move(Streamer)); -} - -extern "C" void LLVMInitializeR600AsmPrinter() { - TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); - TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); -} - -AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} - -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - - // This label is used to mark the end of the .text section. - const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); -} - -bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { - - // The starting address of all shader programs must be 256 bytes aligned. 
- MF.setAlignment(8); - - SetupMachineFunction(MF); - - MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); - - const AMDGPUSubtarget &STM = MF.getSubtarget(); - SIProgramInfo KernelInfo; - if (STM.isAmdHsaOS()) { - getSIProgramInfo(KernelInfo, MF); - EmitAmdKernelCodeT(MF, KernelInfo); - OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1)); - } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - getSIProgramInfo(KernelInfo, MF); - EmitProgramInfoSI(MF, KernelInfo); - } else { - EmitProgramInfoR600(MF); - } - - DisasmLines.clear(); - HexLines.clear(); - DisasmLineMaxLen = 0; - - EmitFunctionBody(); - - if (isVerbose()) { - MCSectionELF *CommentSection = - Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); - - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); - OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), - false); - OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), - false); - OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), - false); - OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), - false); - OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), - false); - } else { - R600MachineFunctionInfo *MFI = MF.getInfo(); - OutStreamer->emitRawComment( - Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); - } - } - - if (STM.dumpCode()) { - - OutStreamer->SwitchSection( - Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); - - for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; - - OutStreamer->EmitBytes(StringRef(DisasmLines[i])); - OutStreamer->EmitBytes(StringRef(Comment)); - } - } - - return false; -} - -void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { - unsigned MaxGPR = 0; - bool killPixel = false; - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const R600RegisterInfo *RI = - static_cast(STM.getRegisterInfo()); - const R600MachineFunctionInfo *MFI = MF.getInfo(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::KILLGT) - killPixel = true; - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - if (!MO.isReg()) - continue; - unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; - - // Register with value > 127 aren't GPR - if (HWReg > 127) - continue; - MaxGPR = std::max(MaxGPR, HWReg); - } - } - } - - unsigned RsrcReg; - if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { - // Evergreen / Northern Islands - switch (MFI->getShaderType()) { - default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; - } - } else { - // R600 / R700 - switch (MFI->getShaderType()) { - default: // Fall through - case 
ShaderType::GEOMETRY: // Fall through - case ShaderType::COMPUTE: // Fall through - case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; - } - } - - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | - S_STACK_SIZE(MFI->StackSize), 4); - OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); - OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - - if (MFI->getShaderType() == ShaderType::COMPUTE) { - OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); - } -} - -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = - static_cast(STM.getRegisterInfo()); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - // TODO: CodeSize should account for multiple functions. - CodeSize += MI.getDesc().Size; - - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; - - if (!MO.isReg()) { - continue; - } - unsigned reg = MO.getReg(); - if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || - reg == AMDGPU::VCC_HI) { - VCCUsed = true; - continue; - } else if (reg == AMDGPU::FLAT_SCR || - reg == AMDGPU::FLAT_SCR_LO || - reg == AMDGPU::FLAT_SCR_HI) { - FlatUsed = true; - continue; - } - - switch (reg) { - default: break; - case AMDGPU::SCC: - case AMDGPU::EXEC: - case AMDGPU::M0: - continue; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; - } - } - } - } - - if (VCCUsed) - MaxSGPR += 2; - - if (FlatUsed) - MaxSGPR += 2; - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. 
- ProgInfo.NumVGPR = MaxVGPR + 1; - ProgInfo.NumSGPR = MaxSGPR + 1; - - if (STM.hasSGPRInitBug()) { - if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) - llvm_unreachable("Too many SGPRs used with the SGPR init bug"); - - ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - } - - ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; - ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; - // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode - // register. - ProgInfo.FloatMode = getFPMode(MF); - - // XXX: Not quite sure what this does, but sc seems to unset this. - ProgInfo.IEEEMode = 0; - - // Do not clamp NAN to 0. - ProgInfo.DX10Clamp = 0; - - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); - - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; - - unsigned LDSAlignShift; - if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks. - LDSAlignShift = 8; - } else { - // LDS is allocated in 128 dword blocks. - LDSAlignShift = 9; - } - - unsigned LDSSpillSize = MFI->LDSWaveSpillSize * - MFI->getMaximumWorkGroupSize(MF); - - ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; - ProgInfo.LDSBlocks = - RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; - - // Scratch is allocated in 256 dword blocks. - unsigned ScratchAlignShift = 10; - // We need to program the hardware with the amount of scratch memory that - // is used by the entire wave. ProgInfo.ScratchSize is the amount of - // scratch memory used per thread. - ProgInfo.ScratchBlocks = - RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1 << ScratchAlignShift) >> ScratchAlignShift; - - ProgInfo.ComputePGMRSrc1 = - S_00B848_VGPRS(ProgInfo.VGPRBlocks) | - S_00B848_SGPRS(ProgInfo.SGPRBlocks) | - S_00B848_PRIORITY(ProgInfo.Priority) | - S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | - S_00B848_PRIV(ProgInfo.Priv) | - S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); - - ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); -} - -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { - default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; - } -} - -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); - - if (MFI->getShaderType() == ShaderType::COMPUTE) { - OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); - - OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); - - OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); - - // TODO: 
Should probably note flat usage somewhere. SC emits a "FlatPtr32 = - // 0" comment but I don't see a corresponding field in the register spec. - } else { - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | - S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MFI)) { - OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); - } - } - - if (MFI->getShaderType() == ShaderType::PIXEL) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); - } -} - -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - const AMDGPUSubtarget &STM = MF.getSubtarget(); - amd_kernel_code_t header; - - memset(&header, 0, sizeof(header)); - - header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; - header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; - - header.struct_byte_size = sizeof(amd_kernel_code_t); - - header.target_chip = STM.getAmdKernelCodeChipID(); - - header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); - - header.compute_pgm_resource_registers = - KernelInfo.ComputePGMRSrc1 | - (KernelInfo.ComputePGMRSrc2 << 32); - - // Code Properties: - header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; - - if (KernelInfo.FlatUsed) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - - if (KernelInfo.ScratchBlocks) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; - - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - - // MFI->ABIArgOffset is the number of bytes for the kernel arguments - // plus 36. 36 is the number of bytes reserved at the begining of the - // input buffer to store work-group size information. - // FIXME: We should be adding the size of the implicit arguments - // to this value. - header.kernarg_segment_byte_size = MFI->ABIArgOffset; - - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - - // FIXME: What values do I put for these alignments - header.kernarg_segment_alignment = 0; - header.group_segment_alignment = 0; - header.private_segment_alignment = 0; - - header.code_type = 1; // HSA_EXT_CODE_KERNEL - - header.wavefront_size = STM.getWavefrontSize(); - - MCSectionELF *VersionSection = - OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(VersionSection); - OutStreamer->EmitBytes(Twine("HSA Code Unit:" + - Twine(header.hsail_version_major) + "." + - Twine(header.hsail_version_minor) + ":" + - "AMD:" + - Twine(header.amd_code_version_major) + "." 
+ - Twine(header.amd_code_version_minor) + ":" + - "GFX8.1:0").str()); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - - if (isVerbose()) { - OutStreamer->emitRawComment("amd_code_version_major = " + - Twine(header.amd_code_version_major), false); - OutStreamer->emitRawComment("amd_code_version_minor = " + - Twine(header.amd_code_version_minor), false); - OutStreamer->emitRawComment("struct_byte_size = " + - Twine(header.struct_byte_size), false); - OutStreamer->emitRawComment("target_chip = " + - Twine(header.target_chip), false); - OutStreamer->emitRawComment(" compute_pgm_rsrc1: " + - Twine::utohexstr(KernelInfo.ComputePGMRSrc1), - false); - OutStreamer->emitRawComment(" compute_pgm_rsrc2: " + - Twine::utohexstr(KernelInfo.ComputePGMRSrc2), - false); - OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " + - Twine((bool)(header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); - OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " + - Twine((bool)(header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); - OutStreamer->emitRawComment("private_element_size = 2 ", false); - OutStreamer->emitRawComment("is_ptr64 = " + - Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); - OutStreamer->emitRawComment("workitem_private_segment_byte_size = " + - Twine(header.workitem_private_segment_byte_size), - false); - OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " + - Twine(header.workgroup_group_segment_byte_size), - false); - OutStreamer->emitRawComment("gds_segment_byte_size = " + - Twine(header.gds_segment_byte_size), false); - OutStreamer->emitRawComment("kernarg_segment_byte_size = " + - Twine(header.kernarg_segment_byte_size), false); - OutStreamer->emitRawComment("wavefront_sgpr_count = " + - Twine(header.wavefront_sgpr_count), false); - OutStreamer->emitRawComment("workitem_vgpr_count = " + - Twine(header.workitem_vgpr_count), false); - OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false); - OutStreamer->emitRawComment("wavefront_size = " + - Twine((int)header.wavefront_size), false); - OutStreamer->emitRawComment("optimization_level = " + - Twine(header.optimization_level), false); - OutStreamer->emitRawComment("hsail_profile = " + - Twine(header.hsail_profile), false); - OutStreamer->emitRawComment("hsail_machine_model = " + - Twine(header.hsail_machine_model), false); - OutStreamer->emitRawComment("hsail_version_major = " + - Twine(header.hsail_version_major), false); - OutStreamer->emitRawComment("hsail_version_minor = " + - Twine(header.hsail_version_minor), false); - } - - OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header))); -} - -bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. 
- - switch (ExtraCode[0]) { - default: - // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); - case 'r': - break; - } - } - - AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, - *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); - return false; -} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h deleted file mode 100644 index 1acff3a3222..00000000000 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ /dev/null @@ -1,113 +0,0 @@ -//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU Assembly printer class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H -#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H - -#include "llvm/CodeGen/AsmPrinter.h" -#include - -namespace llvm { - -class AMDGPUAsmPrinter : public AsmPrinter { -private: - struct SIProgramInfo { - SIProgramInfo() : - VGPRBlocks(0), - SGPRBlocks(0), - Priority(0), - FloatMode(0), - Priv(0), - DX10Clamp(0), - DebugMode(0), - IEEEMode(0), - ScratchSize(0), - ComputePGMRSrc1(0), - LDSBlocks(0), - ScratchBlocks(0), - ComputePGMRSrc2(0), - NumVGPR(0), - NumSGPR(0), - FlatUsed(false), - VCCUsed(false), - CodeLen(0) {} - - // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks; - uint32_t SGPRBlocks; - uint32_t Priority; - uint32_t FloatMode; - uint32_t Priv; - uint32_t DX10Clamp; - uint32_t DebugMode; - uint32_t IEEEMode; - uint32_t ScratchSize; - - uint64_t ComputePGMRSrc1; - - // Fields set in PGM_RSRC2 pm4 packet. - uint32_t LDSBlocks; - uint32_t ScratchBlocks; - - uint64_t ComputePGMRSrc2; - - uint32_t NumVGPR; - uint32_t NumSGPR; - uint32_t LDSSize; - bool FlatUsed; - - // Bonus information for debugging. - bool VCCUsed; - uint64_t CodeLen; - }; - - void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; - void findNumUsedRegistersSI(const MachineFunction &MF, - unsigned &NumSGPR, - unsigned &NumVGPR) const; - - /// \brief Emit register usage information so that the GPU driver - /// can correctly setup the GPU state. 
- void EmitProgramInfoR600(const MachineFunction &MF); - void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); - void EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const; - -public: - explicit AMDGPUAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer); - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "AMDGPU Assembly Printer"; - } - - /// Implemented in AMDGPUMCInstLower.cpp - void EmitInstruction(const MachineInstr *MI) override; - - void EmitEndOfAsmFile(Module &M) override; - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - -protected: - std::vector DisasmLines, HexLines; - size_t DisasmLineMaxLen; -}; - -} // End anonymous llvm - -#endif diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td deleted file mode 100644 index 6ffa7a08358..00000000000 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ /dev/null @@ -1,82 +0,0 @@ -//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This describes the calling conventions for the AMD Radeon GPUs. -// -//===----------------------------------------------------------------------===// - -// Inversion of CCIfInReg -class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {} - -// Calling convention for SI -def CC_SI : CallingConv<[ - - CCIfInReg>>, - - CCIfInReg>>, - - CCIfNotInReg>>, - - CCIfByVal>> - -]>; - -// Calling convention for R600 -def CC_R600 : CallingConv<[ - CCIfInReg>> -]>; - -// Calling convention for compute kernels -def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateStack"> -]>; - -def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >=" - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo> -]>; diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp deleted file mode 100644 index 8175786fb9b..00000000000 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ /dev/null @@ -1,112 +0,0 @@ -//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -// Interface to describe a layout of a stack frame on a AMDIL target machine -// -//===----------------------------------------------------------------------===// -#include "AMDGPUFrameLowering.h" -#include "AMDGPURegisterInfo.h" -#include "R600MachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Instructions.h" - -using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { } - -AMDGPUFrameLowering::~AMDGPUFrameLowering() { } - -unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { - - // XXX: Hardcoding to 1 for now. - // - // I think the StackWidth should stored as metadata associated with the - // MachineFunction. This metadata can either be added by a frontend, or - // calculated by a R600 specific LLVM IR pass. - // - // The StackWidth determines how stack objects are laid out in memory. - // For a vector stack variable, like: int4 stack[2], the data will be stored - // in the following ways depending on the StackWidth. - // - // StackWidth = 1: - // - // T0.X = stack[0].x - // T1.X = stack[0].y - // T2.X = stack[0].z - // T3.X = stack[0].w - // T4.X = stack[1].x - // T5.X = stack[1].y - // T6.X = stack[1].z - // T7.X = stack[1].w - // - // StackWidth = 2: - // - // T0.X = stack[0].x - // T0.Y = stack[0].y - // T1.X = stack[0].z - // T1.Y = stack[0].w - // T2.X = stack[1].x - // T2.Y = stack[1].y - // T3.X = stack[1].z - // T3.Y = stack[1].w - // - // StackWidth = 4: - // T0.X = stack[0].x - // T0.Y = stack[0].y - // T0.Z = stack[0].z - // T0.W = stack[0].w - // T1.X = stack[1].x - // T1.Y = stack[1].y - // T1.Z = stack[1].z - // T1.W = stack[1].w - return 1; -} - -/// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Start the offset at 2 so we don't overwrite work group information. - // XXX: We should only do this when the shader actually uses this - // information. - unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); - int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; - - for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); - OffsetBytes += MFI->getObjectSize(i); - // Each register holds 4 bytes, so we must always align the offset to at - // least 4 bytes, so that 2 frame objects won't share the same register. 
- OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); - } - - if (FI != -1) - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); - - return OffsetBytes / (getStackWidth(MF) * 4); -} - -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return nullptr; -} -void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} - -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h deleted file mode 100644 index 9f31be1af79..00000000000 --- a/lib/Target/R600/AMDGPUFrameLowering.h +++ /dev/null @@ -1,45 +0,0 @@ -//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface to describe a layout of a stack frame on a AMDIL target -/// machine. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H - -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -/// \brief Information about the stack frame layout on the AMDGPU targets. -/// -/// It holds the direction of the stack growth, the known stack alignment on -/// entry to each function, and the offset to the locals area. -/// See TargetFrameInfo for more comments. -class AMDGPUFrameLowering : public TargetFrameLowering { -public: - AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1); - virtual ~AMDGPUFrameLowering(); - - /// \returns The number of 32-bit sub-registers that are used when storing - /// values to the stack. - unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; -}; -} // namespace llvm -#endif diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp deleted file mode 100644 index df4461eac4d..00000000000 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ /dev/null @@ -1,1371 +0,0 @@ -//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Defines an instruction selector for the AMDGPU target. 
-// -//===----------------------------------------------------------------------===// -#include "AMDGPUInstrInfo.h" -#include "AMDGPUISelLowering.h" // For AMDGPUISD -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "SIDefines.h" -#include "SIISelLowering.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Instruction Selector Implementation -//===----------------------------------------------------------------------===// - -namespace { -/// AMDGPU specific code to select AMDGPU machine instructions for -/// SelectionDAG operations. -class AMDGPUDAGToDAGISel : public SelectionDAGISel { - // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can - // make the right decision when generating code for different targets. - const AMDGPUSubtarget *Subtarget; -public: - AMDGPUDAGToDAGISel(TargetMachine &TM); - virtual ~AMDGPUDAGToDAGISel(); - bool runOnMachineFunction(MachineFunction &MF) override; - SDNode *Select(SDNode *N) override; - const char *getPassName() const override; - void PostprocessISelDAG() override; - -private: - bool isInlineImmediate(SDNode *N) const; - bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, - const R600InstrInfo *TII); - bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); - bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector &); - - // Complex pattern selectors - bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); - bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); - bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - - static bool checkType(const Value *ptr, unsigned int addrspace); - static bool checkPrivateAddress(const MachineMemOperand *Op); - - static bool isGlobalStore(const StoreSDNode *N); - static bool isFlatStore(const StoreSDNode *N); - static bool isPrivateStore(const StoreSDNode *N); - static bool isLocalStore(const StoreSDNode *N); - static bool isRegionStore(const StoreSDNode *N); - - bool isCPLoad(const LoadSDNode *N) const; - bool isConstantLoad(const LoadSDNode *N, int cbID) const; - bool isGlobalLoad(const LoadSDNode *N) const; - bool isFlatLoad(const LoadSDNode *N) const; - bool isParamLoad(const LoadSDNode *N) const; - bool isPrivateLoad(const LoadSDNode *N) const; - bool isLocalLoad(const LoadSDNode *N) const; - bool isRegionLoad(const LoadSDNode *N) const; - - SDNode *glueCopyToM0(SDNode *N) const; - - const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; - bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, - SDValue& Offset); - bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, - unsigned OffsetBits) const; - bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; - bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, - SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue 
&VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, SDValue &Offset, - SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &GLC) const; - SDNode *SelectAddrSpaceCast(SDNode *N); - bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp, SDValue &Omod) const; - - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Omod) const; - bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp, - SDValue &Omod) const; - - SDNode *SelectADD_SUB_I64(SDNode *N); - SDNode *SelectDIV_SCALE(SDNode *N); - - SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width); - SDNode *SelectS_BFEFromShifts(SDNode *N); - SDNode *SelectS_BFE(SDNode *N); - - // Include the pieces autogenerated from the target description. -#include "AMDGPUGenDAGISel.inc" -}; -} // end anonymous namespace - -/// \brief This pass converts a legalized DAG into a AMDGPU-specific -// DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { - return new AMDGPUDAGToDAGISel(TM); -} - -AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) - : SelectionDAGISel(TM) {} - -bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); - return SelectionDAGISel::runOnMachineFunction(MF); -} - -AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { -} - -bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { - const SITargetLowering *TL - = static_cast(getTargetLowering()); - return TL->analyzeImmediate(N) == 0; -} - -/// \brief Determine the register class for \p OpNo -/// \returns The register class of the virtual register that will be used for -/// the given operand number \OpNo or NULL if the register class cannot be -/// determined. 
-const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, - unsigned OpNo) const { - if (!N->isMachineOpcode()) - return nullptr; - - switch (N->getMachineOpcode()) { - default: { - const MCInstrDesc &Desc = - Subtarget->getInstrInfo()->get(N->getMachineOpcode()); - unsigned OpIdx = Desc.getNumDefs() + OpNo; - if (OpIdx >= Desc.getNumOperands()) - return nullptr; - int RegClass = Desc.OpInfo[OpIdx].RegClass; - if (RegClass == -1) - return nullptr; - - return Subtarget->getRegisterInfo()->getRegClass(RegClass); - } - case AMDGPU::REG_SEQUENCE: { - unsigned RCID = cast(N->getOperand(0))->getZExtValue(); - const TargetRegisterClass *SuperRC = - Subtarget->getRegisterInfo()->getRegClass(RCID); - - SDValue SubRegOp = N->getOperand(OpNo + 1); - unsigned SubRegIdx = cast(SubRegOp)->getZExtValue(); - return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, - SubRegIdx); - } - } -} - -bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - return SelectADDRParam(Addr, R1, R2); -} - - -bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - return true; -} - -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - !checkType(cast(N)->getMemOperand()->getValue(), - AMDGPUAS::LOCAL_ADDRESS)) - return N; - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - // Write max value to m0 before each load operation - - SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), - CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); - - SDValue Glue = M0.getValue(1); - - SmallVector Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - Ops.push_back(N->getOperand(i)); - } - Ops.push_back(Glue); - CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); - - return N; -} - -SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { - unsigned int Opc = N->getOpcode(); - if (N->isMachineOpcode()) { - N->setNodeId(-1); - return nullptr; // Already selected. 
- } - - if (isa(N)) - N = glueCopyToM0(N); - - switch (Opc) { - default: break; - // We are selecting i64 ADD here instead of custom lower it during - // DAG legalization, so we can fold some i64 ADDs used for address - // calculation into the LOAD and STORE instructions. - case ISD::ADD: - case ISD::SUB: { - if (N->getValueType(0) != MVT::i64 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - return SelectADD_SUB_I64(N); - } - case ISD::SCALAR_TO_VECTOR: - case AMDGPUISD::BUILD_VERTICAL_VECTOR: - case ISD::BUILD_VECTOR: { - unsigned RegClassID; - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); - EVT VT = N->getValueType(0); - unsigned NumVectorElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - assert(EltVT.bitsEq(MVT::i32)); - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - bool UseVReg = true; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (!U->isMachineOpcode()) { - continue; - } - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (!RC) { - continue; - } - if (static_cast(TRI)->isSGPRClass(RC)) { - UseVReg = false; - } - } - switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : - AMDGPU::SReg_32RegClassID; - break; - case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : - AMDGPU::SReg_64RegClassID; - break; - case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : - AMDGPU::SReg_128RegClassID; - break; - case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : - AMDGPU::SReg_256RegClassID; - break; - case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : - AMDGPU::SReg_512RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } else { - // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG - // that adds a 128 bits reg copy when going through TwoAddressInstructions - // pass. We want to avoid 128 bits copies as much as possible because they - // can't be bundled by our scheduler. - switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: - if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; - else - RegClassID = AMDGPU::R600_Reg128RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } - - SDLoc DL(N); - SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - - if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, - N->getOperand(0), RegClass); - } - - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " - "supported yet"); - // 16 = Max Num Vector Elements - // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) - // 1 = Vector Register Class - SmallVector RegSeqArgs(NumVectorElts * 2 + 1); - - RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - bool IsRegSeq = true; - unsigned NOps = N->getNumOperands(); - for (unsigned i = 0; i < NOps; i++) { - // XXX: Why is this here? - if (isa(N->getOperand(i))) { - IsRegSeq = false; - break; - } - RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); - } - - if (NOps != NumVectorElts) { - // Fill in the missing undef elements if this was a scalar_to_vector. 
- assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); - - MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - DL, EltVT); - for (unsigned i = NOps; i < NumVectorElts; ++i) { - RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); - } - } - - if (!IsRegSeq) - break; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs); - } - case ISD::BUILD_PAIR: { - SDValue RC, SubReg0, SubReg1; - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - break; - } - SDLoc DL(N); - if (N->getValueType(0) == MVT::i128) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); - SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); - SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); - } else if (N->getValueType(0) == MVT::i64) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); - SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - } else { - llvm_unreachable("Unhandled value type for BUILD_PAIR"); - } - const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, - N->getOperand(1), SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); - } - - case ISD::Constant: - case ISD::ConstantFP: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) - break; - - uint64_t Imm; - if (ConstantFPSDNode *FP = dyn_cast(N)) - Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); - else { - ConstantSDNode *C = cast(N); - Imm = C->getZExtValue(); - } - - SDLoc DL(N); - SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, - MVT::i32)); - SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); - const SDValue Ops[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, - N->getValueType(0), Ops); - } - - case ISD::LOAD: { - LoadSDNode *LD = cast(N); - SDLoc SL(N); - EVT VT = N->getValueType(0); - - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { - N = glueCopyToM0(N); - break; - } - - // To simplify the TableGen patters, we replace all i64 loads with - // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 - // during DAG legalization, however, so places (ExpandUnalignedLoad) - // in the DAG legalizer assume that if i64 is legal, so doing this - // promotion early can cause problems. - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); - SDNode *Load = glueCopyToM0(NewLoad.getNode()); - SelectCode(Load); - N = BitCast.getNode(); - break; - } - - case ISD::STORE: { - // Handle i64 stores here for the same reason mentioned above for loads. 
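// --- Editor's illustrative aside (not part of the original change) ---------
// The LOAD and STORE cases here rely on viewing an i64 value as a pair of
// 32-bit words (the i64 <-> v2i32 bitcast), so the TableGen patterns only
// have to cover 32-bit memory operations. A minimal standalone sketch of
// that equivalence, assuming a lo/hi word split:
#include <cstdint>
#include <utility>

// Split a 64-bit value into its low and high 32-bit halves.
static inline std::pair<uint32_t, uint32_t> splitI64(uint64_t V) {
  return {static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32)};
}

// Rejoin the halves; splitI64 followed by joinI64 is the identity, which is
// why replacing i64 memory operations with v2i32 ones plus a bitcast is safe.
static inline uint64_t joinI64(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
// ---------------------------------------------------------------------------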
- StoreSDNode *ST = cast(N); - SDValue Value = ST->getValue(); - if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { - - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), - MVT::v2i32, Value); - SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, - ST->getBasePtr(), ST->getMemOperand()); - - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); - - if (NewValue.getOpcode() == ISD::BITCAST) { - Select(NewStore.getNode()); - return SelectCode(NewValue.getNode()); - } - - // getNode() may fold the bitcast if its input was another bitcast. If that - // happens we should only select the new store. - N = NewStore.getNode(); - } - - N = glueCopyToM0(N); - break; - } - - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } - - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - // There is a scalar version available, but unlike the vector version which - // has a separate operand for the offset and width, the scalar version packs - // the width and offset into a single operand. Try to move to the scalar - // version if the offsets are constant, so that we can try to keep extended - // loads of kernel arguments in SGPRs. - - // TODO: Technically we could try to pattern match scalar bitshifts of - // dynamic values, but it's probably not useful. - ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); - if (!Offset) - break; - - ConstantSDNode *Width = dyn_cast(N->getOperand(2)); - if (!Width) - break; - - bool Signed = Opc == AMDGPUISD::BFE_I32; - - uint32_t OffsetVal = Offset->getZExtValue(); - uint32_t WidthVal = Width->getZExtValue(); - - return getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), - N->getOperand(0), OffsetVal, WidthVal); - - } - case AMDGPUISD::DIV_SCALE: { - return SelectDIV_SCALE(N); - } - case ISD::CopyToReg: { - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - Lowering.legalizeTargetIndependentNode(N, *CurDAG); - break; - } - case ISD::ADDRSPACECAST: - return SelectAddrSpaceCast(N); - case ISD::AND: - case ISD::SRL: - case ISD::SRA: - if (N->getValueType(0) != MVT::i32 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - return SelectS_BFE(N); - } - - return SelectCode(N); -} - - -bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { - assert(AS != 0 && "Use checkPrivateAddress instead."); - if (!Ptr) - return false; - - return Ptr->getType()->getPointerAddressSpace() == AS; -} - -bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { - if (Op->getPseudoValue()) - return true; - - if (PointerType *PT = dyn_cast(Op->getValue()->getType())) - return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; - - return false; -} - -bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - const Value *MemVal = N->getMemOperand()->getValue(); - return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); -} - -bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - const Value *MemVal = N->getMemOperand()->getValue(); - if (CbId == -1) - return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); - - return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); -} - -bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { - if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getMemoryVT().bitsLT(MVT::i32)) - return true; - - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { - MachineMemOperand *MMO = N->getMemOperand(); - if (checkPrivateAddress(N->getMemOperand())) { - if (MMO) { - const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV == PseudoSourceValue::getConstantPool()) { - return true; - } - } - } - return false; -} - -bool 
AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkPrivateAddress(N->getMemOperand())) { - // Check to make sure we are not a constant pool load or a constant load - // that is marked as a private load - if (isCPLoad(N) || isConstantLoad(N, -1)) { - return false; - } - } - - const Value *MemVal = N->getMemOperand()->getValue(); - if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && - !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { - return true; - } - return false; -} - -const char *AMDGPUDAGToDAGISel::getPassName() const { - return "AMDGPU DAG->DAG Pattern Instruction Selection"; -} - -#ifdef DEBUGTMP -#undef INT64_C -#endif -#undef DEBUGTMP - -//===----------------------------------------------------------------------===// -// Complex Patterns -//===----------------------------------------------------------------------===// - -bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { - if (ConstantSDNode *Cst = dyn_cast(Addr)) { - IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), - true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, - SDValue& BaseReg, SDValue &Offset) { - if (!isa(Addr)) { - BaseReg = Addr; - Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *IMMOffset; - - if (Addr.getOpcode() == ISD::ADD - && (IMMOffset = dyn_cast(Addr.getOperand(1))) - && isInt<16>(IMMOffset->getZExtValue())) { - - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - // If the pointer address is constant, we can move it to the offset field. 
- } else if ((IMMOffset = dyn_cast(Addr)) - && isInt<16>(IMMOffset->getZExtValue())) { - Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - } - - // Default case, no offset - Base = Addr; - Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *C; - SDLoc DL(Addr); - - if ((C = dyn_cast(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); - Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); - } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && - (C = dyn_cast(Addr.getOperand(1)))) { - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); - } else { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); - } - - return true; -} - -SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { - SDLoc DL(N); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - bool IsAdd = (N->getOpcode() == ISD::ADD); - - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub0); - SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub1); - - SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub0); - SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub1); - - SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - - - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); - SDValue Carry(AddLo, 1); - SDNode *AddHi - = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, - SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); - - SDValue Args[5] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - SDValue(AddLo,0), - Sub0, - SDValue(AddHi,0), - Sub1, - }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); -} - -// We need to handle this here because tablegen doesn't support matching -// instructions with multiple outputs. -SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { - SDLoc SL(N); - EVT VT = N->getValueType(0); - - assert(VT == MVT::f32 || VT == MVT::f64); - - unsigned Opc - = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod - SDValue Ops[8]; - - SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); - SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); - SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); -} - -bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, - unsigned OffsetBits) const { - if ((OffsetBits == 16 && !isUInt<16>(Offset)) || - (OffsetBits == 8 && !isUInt<8>(Offset))) - return false; - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) - return true; - - // On Southern Islands instruction with a negative base value and an offset - // don't seem to work. - return CurDAG->SignBitIsZero(Base); -} - -bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, - SDValue &Offset) const { - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { - // (add n0, c0) - Base = N0; - Offset = N1; - return true; - } - } - - SDLoc DL(Addr); - - // If we have a constant address, prefer to put the constant into the - // offset. This can save moves to load the constant address since multiple - // operations can share the zero base address register, and enables merging - // into read2 / write2 instructions. - if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { - if (isUInt<16>(CAddr->getZExtValue())) { - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); - Base = SDValue(MovZero, 0); - Offset = Addr; - return true; - } - } - - // default case - Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, - SDValue &Offset0, - SDValue &Offset1) const { - SDLoc DL(Addr); - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - unsigned DWordOffset0 = C1->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - // (add n0, c0) - if (isDSOffsetLegal(N0, DWordOffset1, 8)) { - Base = N0; - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); - return true; - } - } - - if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { - unsigned DWordOffset0 = CAddr->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - assert(4 * DWordOffset0 == CAddr->getZExtValue()); - - if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero - = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); - Base = SDValue(MovZero, 0); - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); - return true; - } - } - - // default case - Base = Addr; - Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); - return true; -} - -static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); -} - -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, - SDValue &VAddr, SDValue 
&SOffset, - SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, - SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { - SDLoc DL(Addr); - - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - - Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); - Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); - Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - - if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) -> addr64 - SDValue N2 = N0.getOperand(0); - SDValue N3 = N0.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); - Ptr = N2; - VAddr = N3; - } else { - - // (add N0, C1) -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = N0; - } - - if (isLegalMUBUFImmOffset(C1)) { - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; - } else if (isUInt<32>(C1->getZExtValue())) { - // Illegal offset, store it in soffset. - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), - 0); - return; - } - } - - if (Addr.getOpcode() == ISD::ADD) { - // (add N0, N1) -> addr64 - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); - Ptr = N0; - VAddr = N1; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; - } - - // default case -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const { - SDValue Ptr, Offen, Idxen, Addr64; - - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); - - ConstantSDNode *C = cast(Addr64); - if (C->getSExtValue()) { - SDLoc DL(Addr); - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); - return true; - } - - return false; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { - SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE; - - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { - - SDLoc DL(Addr); - MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, 
MVT::i32, Sym0), 0); - - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); - - // (add n0, c1) - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - - if (isLegalMUBUFImmOffset(C1)) { - VAddr = Addr.getOperand(0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } - } - - // (node) - VAddr = Addr; - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &SOffset, SDValue &Offset, - SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { - SDValue Ptr, VAddr, Offen, Idxen, Addr64; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); - - if (!cast(Offen)->getSExtValue() && - !cast(Idxen)->getSExtValue() && - !cast(Addr64)->getSExtValue()) { - uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | - APInt::getAllOnesValue(32).getZExtValue(); // Size - SDLoc DL(Addr); - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset, - SDValue &GLC) const { - SDValue SLC, TFE; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); -} - -// FIXME: This is incorrect and only enough to be able to compile. -SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { - AddrSpaceCastSDNode *ASC = cast(N); - SDLoc DL(N); - - assert(Subtarget->hasFlatAddressSpace() && - "addrspacecast only supported with flat address space!"); - - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && - "Can only cast to / from flat address space!"); - - // The flat instructions read the address as the index of the VGPR holding the - // address, so casting should just be reinterpreting the base VGPR, so just - // insert trunc / bitcast / zext. 
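// --- Editor's illustrative aside (not part of the original change) ---------
// The addrspacecast handling that follows only resizes the address bits: a
// 64-bit flat pointer narrowed to 32 bits keeps the low word (sub0), a
// 32-bit pointer widened to 64 bits gets a zero high word, and same-size
// casts are plain bitcasts. A minimal integer-level sketch of those rules:
#include <cstdint>

static inline uint32_t narrowFlatAddrTo32(uint64_t Addr64) {
  return static_cast<uint32_t>(Addr64);   // corresponds to EXTRACT_SUBREG sub0
}

static inline uint64_t widenAddrToFlat64(uint32_t Addr32) {
  return static_cast<uint64_t>(Addr32);   // corresponds to REG_SEQUENCE {addr, 0}
}
// ---------------------------------------------------------------------------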
- - SDValue Src = ASC->getOperand(0); - EVT DestVT = ASC->getValueType(0); - EVT SrcVT = Src.getValueType(); - - unsigned SrcSize = SrcVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); - - if (SrcSize > DestSize) { - assert(SrcSize == 64 && DestSize == 32); - return CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, - DL, - DestVT, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); - } - - - if (DestSize > SrcSize) { - assert(SrcSize == 32 && DestSize == 64); - - // FIXME: This is probably wrong, we should never be defining - // a register class with both VGPRs and SGPRs - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, - MVT::i32); - - const SDValue Ops[] = { - RC, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(0, DL, MVT::i32)), 0), - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); - } - - assert(SrcSize == 64 && DestSize == 64); - return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); -} - -SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width) { - // Transformation function, pack the offset and width of a BFE into - // the format expected by the S_BFE_I32 / S_BFE_U32. In the second - // source, bits [5:0] contain the offset and bits [22:16] the width. - uint32_t PackedVal = Offset | (Width << 16); - SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); - - return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); -} - -SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { - // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) - // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) - // Predicate: 0 < b <= c < 32 - - const SDValue &Shl = N->getOperand(0); - ConstantSDNode *B = dyn_cast(Shl->getOperand(1)); - ConstantSDNode *C = dyn_cast(N->getOperand(1)); - - if (B && C) { - uint32_t BVal = B->getZExtValue(); - uint32_t CVal = C->getZExtValue(); - - if (0 < BVal && BVal <= CVal && CVal < 32) { - bool Signed = N->getOpcode() == ISD::SRA; - unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; - - return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), - CVal - BVal, 32 - CVal); - } - } - return SelectCode(N); -} - -SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { - switch (N->getOpcode()) { - case ISD::AND: - if (N->getOperand(0).getOpcode() == ISD::SRL) { - // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" - // Predicate: isMask(mask) - const SDValue &Srl = N->getOperand(0); - ConstantSDNode *Shift = dyn_cast(Srl.getOperand(1)); - ConstantSDNode *Mask = dyn_cast(N->getOperand(1)); - - if (Shift && Mask) { - uint32_t ShiftVal = Shift->getZExtValue(); - uint32_t MaskVal = Mask->getZExtValue(); - - if (isMask_32(MaskVal)) { - uint32_t WidthVal = countPopulation(MaskVal); - - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), - ShiftVal, WidthVal); - } - } - } - break; - case ISD::SRL: - if (N->getOperand(0).getOpcode() == ISD::AND) { - // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" - // Predicate: isMask(mask >> b) - const SDValue &And = N->getOperand(0); - ConstantSDNode *Shift = dyn_cast(N->getOperand(1)); - ConstantSDNode *Mask = dyn_cast(And->getOperand(1)); - - if (Shift && Mask) { - uint32_t ShiftVal = Shift->getZExtValue(); - uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; - - if (isMask_32(MaskVal)) { - uint32_t WidthVal = countPopulation(MaskVal); - - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), - ShiftVal, WidthVal); - } - } - } else if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); - break; - case ISD::SRA: - if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); - break; - } - - return SelectCode(N); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - - unsigned Mods = 0; - - Src = In; - - if (Src.getOpcode() == ISD::FNEG) { - Mods |= SISrcMods::NEG; - Src = Src.getOperand(0); - } - - if (Src.getOpcode() == ISD::FABS) { - Mods |= SISrcMods::ABS; - Src = Src.getOperand(0); - } - - SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - - return true; -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Clamp, - SDValue &Omod) const { - Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); - return SelectVOP3Mods(In, Src, SrcMods); -} - -void AMDGPUDAGToDAGISel::PostprocessISelDAG() { - const AMDGPUTargetLowering& Lowering = - *static_cast(getTargetLowering()); - bool IsModified = false; - do { - IsModified = false; - // Go over all selected nodes and try to fold them a bit more - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - - SDNode *Node = I; - - MachineSDNode *MachineNode = dyn_cast(I); - if (!MachineNode) - continue; - - SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); - if (ResNode != Node) { - ReplaceUses(Node, 
ResNode); - IsModified = true; - } - } - CurDAG->RemoveDeadNodes(); - } while (IsModified); -} diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp deleted file mode 100644 index d56838ec201..00000000000 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ /dev/null @@ -1,2866 +0,0 @@ -//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This is the parent TargetLowering class for hardware code gen -/// targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUISelLowering.h" -#include "AMDGPU.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600MachineFunctionInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" - -using namespace llvm; - -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. -class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - -static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - - return true; -} - -#include "AMDGPUGenCallingConv.inc" - -// Find a larger type to do a load / store of a vector with. -EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, StoreSize); - - assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - -// Type for a vector that will be loaded to. 
-EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, 32); - - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - - // Library functions. These default to Expand, but we have instructions - // for them. - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); - - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); - - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); - - // Expand to fneg + fadd. - setOperationAction(ISD::FSUB, MVT::f64, Expand); - - // Lower floating point store/load to integer store/load to reduce the number - // of patterns in tablegen. - setOperationAction(ISD::STORE, MVT::f32, Promote); - AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); - - setOperationAction(ISD::STORE, MVT::v2f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); - - setOperationAction(ISD::STORE, MVT::v4f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v8f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); - - setOperationAction(ISD::STORE, MVT::v16f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); - - setOperationAction(ISD::STORE, MVT::f64, Promote); - AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); - - setOperationAction(ISD::STORE, MVT::v2f64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); - - // Custom lowering of vector stores is required for local address space - // stores. - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - - // XXX: This can be change to Custom, once ExpandVectorStores can - // handle 64-bit stores. 
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i8, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); - setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); - setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); - - - setOperationAction(ISD::LOAD, MVT::f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); - - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); - - setOperationAction(ISD::LOAD, MVT::v4f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); - - setOperationAction(ISD::LOAD, MVT::v8f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); - - setOperationAction(ISD::LOAD, MVT::v16f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - - setOperationAction(ISD::LOAD, MVT::f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); - - setOperationAction(ISD::LOAD, MVT::v2f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - - // There are no 64-bit extloads. These should be done as a 32-bit extload and - // an extension to 64-bit. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } - - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } - - if (!Subtarget->hasBFI()) { - // fcopysign can be done in a single instruction with BFI. 
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } - - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); - setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - - // GPU does not have divrem function for signed or unsigned. - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); - - // GPU does not have [S|U]MUL_LOHI functions as a single instruction. - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - } - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - - // The hardware supports 32-bit ROTR, but not ROTL. - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); - - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - - setOperationAction(ISD::SMIN, MVT::i32, Legal); - setOperationAction(ISD::UMIN, MVT::i32, Legal); - setOperationAction(ISD::SMAX, MVT::i32, Legal); - setOperationAction(ISD::UMAX, MVT::i32, Legal); - - if (!Subtarget->hasFFBH()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - - if (!Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - - static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v4i32 - }; - - for (MVT VT : VectorIntTypes) { - // Expand the following operations for the current type by default. 
- setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::AND, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); - setOperationAction(ISD::OR, VT, Expand); - setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SRA, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::XOR, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - } - - static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v4f32 - }; - - for (MVT VT : FloatVectorTypes) { - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FMINNUM, VT, Expand); - setOperationAction(ISD::FMAXNUM, VT, Expand); - setOperationAction(ISD::FADD, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FDIV, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FMUL, VT, Expand); - setOperationAction(ISD::FMA, VT, Expand); - setOperationAction(ISD::FRINT, VT, Expand); - setOperationAction(ISD::FNEARBYINT, VT, Expand); - setOperationAction(ISD::FSQRT, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSUB, VT, Expand); - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - } - - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); - - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - - setBooleanContents(ZeroOrNegativeOneBooleanContent); - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - 
setSchedulingPreference(Sched::RegPressure); - setJumpIsExpensive(true); - - // SI at least has hardware support for floating point exceptions, but no way - // of using or handling them is implemented. They are also optional in OpenCL - // (Section 7.3) - setHasFloatingPointExceptions(false); - - setSelectIsExpensive(false); - PredictableSelectIsExpensive = false; - - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); - setFsqrtIsCheap(true); - - // FIXME: Need to really handle these. - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; -} - -//===----------------------------------------------------------------------===// -// Target Information -//===----------------------------------------------------------------------===// - -MVT AMDGPUTargetLowering::getVectorIdxTy() const { - return MVT::i32; -} - -bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { - return true; -} - -// The backend supports 32 and 64 bit floating point immediates. -// FIXME: Why are we reporting vectors of FP immediates as legal? -bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); -} - -// We don't want to shrink f64 / f32 constants. -bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); -} - -bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, - ISD::LoadExtType, - EVT NewVT) const { - - unsigned NewSize = NewVT.getStoreSizeInBits(); - - // If we are reducing to a 32-bit load, this is always better. - if (NewSize == 32) - return true; - - EVT OldVT = N->getValueType(0); - unsigned OldSize = OldVT.getStoreSizeInBits(); - - // Don't produce extloads from sub 32-bit types. SI doesn't have scalar - // extloads, so doing one requires using a buffer_load. In cases where we - // still couldn't use a scalar load, using the wider load shouldn't really - // hurt anything. - - // If the old size already had to be an extload, there's no harm in continuing - // to reduce the width. - return (OldSize < 32); -} - -bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, - EVT CastTy) const { - if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) - return true; - - unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); - unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); - - return ((LScalarSize <= CastScalarSize) || - (CastScalarSize >= 32) || - (LScalarSize < 32)); -} - -// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also -// profitable with the expansion for 64-bit since it's generally good to -// speculate things. -// FIXME: These should really have the size as a parameter. 
-bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { - return true; -} - -bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { - return true; -} - -//===---------------------------------------------------------------------===// -// Target Properties -//===---------------------------------------------------------------------===// - -bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { - assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; -} - -bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; -} - -bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, - unsigned NumElem, - unsigned AS) const { - return true; -} - -bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { - // Truncate is just accessing a subregister. - return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); -} - -bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { - // Truncate is just accessing a subregister. - return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && - (Dest->getPrimitiveSizeInBits() % 32 == 0); -} - -bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { - const DataLayout *DL = getDataLayout(); - unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); - unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); - - return SrcSize == 32 && DestSize == 64; -} - -bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { - // Any register load of a 64-bit value really requires 2 32-bit moves. For all - // practical purposes, the extra mov 0 to load a 64-bit is free. As used, - // this will enable reducing 64-bit operations the 32-bit, which is always - // good. - return Src == MVT::i32 && Dest == MVT::i64; -} - -bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { - return isZExtFree(Val.getValueType(), VT2); -} - -bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { - // There aren't really 64-bit registers, but pairs of 32-bit ones and only a - // limited number of native 64-bit operations. Shrinking an operation to fit - // in a single 32-bit register should always be helpful. As currently used, - // this is much less general than the name suggests, and is only used in - // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is - // not profitable, and may actually be harmful. 
- return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; -} - -//===---------------------------------------------------------------------===// -// TargetLowering Callbacks -//===---------------------------------------------------------------------===// - -void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl &Ins) const { - - State.AnalyzeFormalArguments(Ins, CC_AMDGPU); -} - -SDValue AMDGPUTargetLowering::LowerReturn( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); -} - -//===---------------------------------------------------------------------===// -// Target specific lowering -//===---------------------------------------------------------------------===// - -SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SDValue Callee = CLI.Callee; - SelectionDAG &DAG = CLI.DAG; - - const Function &Fn = *DAG.getMachineFunction().getFunction(); - - StringRef FuncName(""); - - if (const ExternalSymbolSDNode *G = dyn_cast(Callee)) - FuncName = G->getSymbol(); - else if (const GlobalAddressSDNode *G = dyn_cast(Callee)) - FuncName = G->getGlobal()->getName(); - - DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); - DAG.getContext()->diagnose(NoCalls); - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, - SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: - Op.getNode()->dump(); - llvm_unreachable("Custom lowering code for this" - "instruction is not implemented yet!"); - break; - case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); - case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); - case ISD::FREM: return LowerFREM(Op, DAG); - case ISD::FCEIL: return LowerFCEIL(Op, DAG); - case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); - case ISD::FRINT: return LowerFRINT(Op, DAG); - case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); - case ISD::FROUND: return LowerFROUND(Op, DAG); - case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); - case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); - case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); - case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); - case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); - } - return Op; -} - -void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - case ISD::SIGN_EXTEND_INREG: - // Different parts of legalization seem to interpret which type of - // sign_extend_inreg is the one to check for custom lowering. The extended - // from type is what really matters, but some places check for custom - // lowering of the result type. This results in trying to use - // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do - // nothing here and let the illegal result integer be handled normally. 
- return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - if (!Node) - return; - - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); - return; - } - case ISD::STORE: { - SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); - if (Lowered.getNode()) - Results.push_back(Lowered); - return; - } - default: - return; - } -} - -// FIXME: This implements accesses to initialized globals in the constant -// address space by copying them to private and accessing that. It does not -// properly handle illegal types or vectors. The private vector loads are not -// scalarized, and the illegal scalars hit an assertion. This technique will not -// work well with large initializers, and this should eventually be -// removed. Initialized globals should be placed into a data section that the -// runtime will load into a buffer before the kernel is executed. Uses of the -// global need to be replaced with a pointer loaded from an implicit kernel -// argument into this buffer holding the copy of the data, which will remove the -// need for any of this. -SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, - const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const { - const DataLayout *TD = getDataLayout(); - SDLoc DL(InitPtr); - Type *InitTy = Init->getType(); - - if (const ConstantInt *CI = dyn_cast(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(InitTy)); - } - - if (const ConstantFP *CFP = dyn_cast(Init)) { - EVT VT = EVT::getEVT(CFP->getType()); - PointerType *PtrTy = PointerType::get(CFP->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(CFP->getType())); - } - - if (StructType *ST = dyn_cast(InitTy)) { - const StructLayout *SL = TD->getStructLayout(ST); - - EVT PtrVT = InitPtr.getValueType(); - SmallVector Chains; - - for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { - SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(I); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (SequentialType *SeqTy = dyn_cast(InitTy)) { - EVT PtrVT = InitPtr.getValueType(); - - unsigned NumElements; - if (ArrayType *AT = dyn_cast(SeqTy)) - NumElements = AT->getNumElements(); - else if (VectorType *VT = dyn_cast(SeqTy)) - NumElements = VT->getNumElements(); - else - llvm_unreachable("Unexpected type"); - - unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); - SmallVector Chains; - for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(i); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if 
(isa(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(InitTy)); - } - - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); -} - -static bool hasDefinedInitializer(const GlobalValue *GV) { - const GlobalVariable *GVar = dyn_cast(GV); - if (!GVar || !GVar->hasInitializer()) - return false; - - if (isa(GVar->getInitializer())) - return false; - - return true; -} - -SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, - SDValue Op, - SelectionDAG &DAG) const { - - const DataLayout *TD = getDataLayout(); - GlobalAddressSDNode *G = cast(Op); - const GlobalValue *GV = G->getGlobal(); - - switch (G->getAddressSpace()) { - case AMDGPUAS::LOCAL_ADDRESS: { - // XXX: What does the value of G->getOffset() mean? - assert(G->getOffset() == 0 && - "Do not know what to do with an non-zero offset"); - - // TODO: We could emit code to handle the initialization somewhere. - if (hasDefinedInitializer(GV)) - break; - - unsigned Offset; - if (MFI->LocalMemoryObjects.count(GV) == 0) { - uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); - Offset = MFI->LDSSize; - MFI->LocalMemoryObjects[GV] = Offset; - // XXX: Account for alignment? - MFI->LDSSize += Size; - } else { - Offset = MFI->LocalMemoryObjects[GV]; - } - - return DAG.getConstant(Offset, SDLoc(Op), - getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - Type *EltType = GV->getType()->getElementType(); - unsigned Size = TD->getTypeAllocSize(EltType); - unsigned Alignment = TD->getPrefTypeAlignment(EltType); - - MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); - MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); - - int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); - - const GlobalVariable *Var = cast(GV); - if (!Var->hasInitializer()) { - // This has no use, but bugpoint will hit it. 
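      // (That is: with no initializer there is nothing to copy into the
      //  private-memory shadow, but returning the bare frame index keeps
      //  reduced test cases, such as those produced by bugpoint, compiling.)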
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } - - const Constant *Init = Var->getInitializer(); - SmallVector WorkList; - - for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), - E = DAG.getEntryNode()->use_end(); I != E; ++I) { - if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) - continue; - WorkList.push_back(*I); - } - SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); - for (SmallVector::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - SmallVector Ops; - Ops.push_back(Chain); - for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { - Ops.push_back((*I)->getOperand(i)); - } - DAG.UpdateNodeOperands(*I, Ops); - } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } - } - - const Function &Fn = *DAG.getMachineFunction().getFunction(); - DiagnosticInfoUnsupported BadInit(Fn, - "initializer for address space"); - DAG.getContext()->diagnose(BadInit); - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, - SelectionDAG &DAG) const { - SmallVector Args; - - for (const SDUse &U : Op->ops()) - DAG.ExtractVectorElements(U.get(), Args); - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); -} - -SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, - SelectionDAG &DAG) const { - - SmallVector Args; - unsigned Start = cast(Op.getOperand(1))->getZExtValue(); - EVT VT = Op.getValueType(); - DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, - VT.getVectorNumElements()); - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); -} - -SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, - SelectionDAG &DAG) const { - - MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); - - FrameIndexSDNode *FIN = cast(Op); - - unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); - return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), - Op.getValueType()); -} - -SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - switch (IntrinsicID) { - default: return Op; - case AMDGPUIntrinsic::AMDGPU_abs: - case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. - return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDGPU_lrp: - return LowerIntrinsicLRP(Op, DAG); - - case AMDGPUIntrinsic::AMDGPU_clamp: - case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. - return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); - if (!Param) - return DAG.getUNDEF(VT); - - // Translate to the operands expected by the machine instruction. The - // first parameter must be the same as the first instruction. - SDValue Numerator = Op.getOperand(1); - SDValue Denominator = Op.getOperand(2); - - // Note this order is opposite of the machine instruction's operations, - // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The - // intrinsic has the numerator as the first operand to match a normal - // division operation. - - SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; - - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, - Denominator, Numerator); - } - - case Intrinsic::AMDGPU_div_fmas: - return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - - case Intrinsic::AMDGPU_div_fixup: - return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::AMDGPU_rcp: - return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq: - return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_legacy_rsq: - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq_clamped: - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - Type *Type = VT.getTypeForEVT(*DAG.getContext()); - APFloat Max = APFloat::getLargest(Type->getFltSemantics()); - APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); - - SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, DL, VT)); - return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, - DAG.getConstantFP(Min, DL, VT)); - } else { - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); - } - - case Intrinsic::AMDGPU_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imax: - return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umax: - return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_imin: - return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umin: - return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umul24: - return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imul24: - return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umad24: - return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_imad24: - return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfi: - return DAG.getNode(AMDGPUISD::BFI, DL, VT, - Op.getOperand(1), - 
Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfm: - return DAG.getNode(AMDGPUISD::BFM, DL, VT, - Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_class: - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. - return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. - return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); - } -} - -///IABS(a) = SMAX(sub(0, a), a) -SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - Op.getOperand(1)); - - return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); -} - -/// Linear Interpolation -/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) -SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, - DAG.getConstantFP(1.0f, DL, MVT::f32), - Op.getOperand(1)); - SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, - Op.getOperand(3)); - return DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), - OneSubAC); -} - -/// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return SDValue(); - - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - ISD::CondCode CCOpcode = cast(CC)->get(); - switch (CCOpcode) { - case ISD::SETOEQ: - case ISD::SETONE: - case ISD::SETUNE: - case ISD::SETNE: - case ISD::SETUEQ: - case ISD::SETEQ: - case ISD::SETFALSE: - case ISD::SETFALSE2: - case ISD::SETTRUE: - case ISD::SETTRUE2: - case ISD::SETUO: - case ISD::SETO: - break; - case ISD::SETULE: - case ISD::SETULT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); - } - case ISD::SETOLE: - case ISD::SETOLT: - case ISD::SETLE: - case ISD::SETLT: { - // Ordered. Assume ordered for undefined. - - // Only do this after legalization to avoid interfering with other combines - // which might occur. - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && - !DCI.isCalledByLegalizer()) - return SDValue(); - - // We need to permute the operands to get the correct NaN behavior. The - // selected operand is the second one based on the failing compare with NaN, - // so permute it based on the compare type the hardware uses. 
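    // Illustrative mapping for the ordered case below:
    // (select (setolt x, y), x, y) becomes fmin_legacy(x, y) and
    // (select (setolt x, y), y, x) becomes fmax_legacy(y, x); the unordered
    // cases above use the swapped operand order so that a NaN input still
    // produces the value the original select would have chosen.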
- if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETOGE: - case ISD::SETOGT: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && - !DCI.isCalledByLegalizer()) - return SDValue(); - - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); - } - case ISD::SETCC_INVALID: - llvm_unreachable("Invalid setcc condcode!"); - } - return SDValue(); -} - -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - -SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, - SelectionDAG &DAG) const { - LoadSDNode *Load = cast(Op); - EVT MemVT = Load->getMemoryVT(); - EVT MemEltVT = MemVT.getVectorElementType(); - - EVT LoadVT = Op.getValueType(); - EVT EltVT = LoadVT.getVectorElementType(); - EVT PtrVT = Load->getBasePtr().getValueType(); - - unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); - SmallVector Loads; - SmallVector Chains; - - SDLoc SL(Op); - unsigned MemEltSize = MemEltVT.getStoreSize(); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * MemEltSize, SL, PtrVT)); - - SDValue NewLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, - SrcValue.getWithOffset(i * MemEltSize), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - Loads.push_back(NewLoad.getValue(0)); - Chains.push_back(NewLoad.getValue(1)); - } - - SDValue Ops[] = { - DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) - }; - - return DAG.getMergeValues(Ops, SL); -} - -SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - // If this is a 2 element vector, we really want to scalarize and not create - // weird 1 element vectors. 
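  // For example, a <2 x i32> load is emitted here as two plain i32 loads,
  // whereas letting the generic split path run would produce two <1 x i32>
  // loads that nothing downstream really wants.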
- if (VT.getVectorNumElements() == 2) - return ScalarizeVectorLoad(Op, DAG); - - LoadSDNode *Load = cast(Op); - SDValue BasePtr = Load->getBasePtr(); - EVT PtrVT = BasePtr.getValueType(); - EVT MemVT = Load->getMemoryVT(); - SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); - - EVT LoVT, HiVT; - EVT LoMemVT, HiMemVT; - SDValue Lo, Hi; - - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); - SDValue LoLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, - SrcValue, - LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - - SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); - - SDValue HiLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, - Load->getChain(), HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - - SDValue Ops[] = { - DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - LoLoad.getValue(1), HiLoad.getValue(1)) - }; - - return DAG.getMergeValues(Ops, SL); -} - -SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast(Op); - EVT MemVT = Store->getMemoryVT(); - unsigned MemBits = MemVT.getSizeInBits(); - - // Byte stores are really expensive, so if possible, try to pack 32-bit vector - // truncating store into an i32 store. - // XXX: We could also handle optimize other vector bitwidths. - if (!MemVT.isVector() || MemBits > 32) { - return SDValue(); - } - - SDLoc DL(Op); - SDValue Value = Store->getValue(); - EVT VT = Value.getValueType(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Ptr = Store->getBasePtr(); - EVT MemEltVT = MemVT.getVectorElementType(); - unsigned MemEltBits = MemEltVT.getSizeInBits(); - unsigned MemNumElements = MemVT.getVectorNumElements(); - unsigned PackedSize = MemVT.getStoreSizeInBits(); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); - - assert(Value.getValueType().getScalarSizeInBits() >= 32); - - SDValue PackedValue; - for (unsigned i = 0; i < MemNumElements; ++i) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, - DAG.getConstant(i, DL, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); - Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg - - SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); - Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); - - if (i == 0) { - PackedValue = Elt; - } else { - PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); - } - } - - if (PackedSize < 32) { - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); - return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - PackedVT, - Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); - } - - return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - Store->isVolatile(), Store->isNonTemporal(), - Store->getAlignment()); -} - -SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast(Op); - EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); - EVT EltVT 
    = Store->getValue().getValueType().getVectorElementType();
-  EVT PtrVT = Store->getBasePtr().getValueType();
-  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
-  SDLoc SL(Op);
-
-  SmallVector<SDValue, 8> Chains;
-
-  unsigned EltSize = MemEltVT.getStoreSize();
-  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-
-  for (unsigned i = 0, e = NumElts; i != e; ++i) {
-    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                              Store->getValue(),
-                              DAG.getConstant(i, SL, MVT::i32));
-
-    SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT);
-    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
-    SDValue NewStore =
-      DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
-                        SrcValue.getWithOffset(i * EltSize),
-                        MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
-                        Store->getAlignment());
-    Chains.push_back(NewStore);
-  }
-
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
-}
-
-SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  SDValue Val = Store->getValue();
-  EVT VT = Val.getValueType();
-
-  // If this is a 2 element vector, we really want to scalarize and not create
-  // weird 1 element vectors.
-  if (VT.getVectorNumElements() == 2)
-    return ScalarizeVectorStore(Op, DAG);
-
-  EVT MemVT = Store->getMemoryVT();
-  SDValue Chain = Store->getChain();
-  SDValue BasePtr = Store->getBasePtr();
-  SDLoc SL(Op);
-
-  EVT LoVT, HiVT;
-  EVT LoMemVT, HiMemVT;
-  SDValue Lo, Hi;
-
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
-  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
-
-  EVT PtrVT = BasePtr.getValueType();
-  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
-                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
-                                              PtrVT));
-
-  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-  SDValue LoStore
-    = DAG.getTruncStore(Chain, SL, Lo,
-                        BasePtr,
-                        SrcValue,
-                        LoMemVT,
-                        Store->isNonTemporal(),
-                        Store->isVolatile(),
-                        Store->getAlignment());
-  SDValue HiStore
-    = DAG.getTruncStore(Chain, SL, Hi,
-                        HiPtr,
-                        SrcValue.getWithOffset(LoMemVT.getStoreSize()),
-                        HiMemVT,
-                        Store->isNonTemporal(),
-                        Store->isVolatile(),
-                        Store->getAlignment());
-
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
-}
-
-
-SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
-  ISD::LoadExtType ExtType = Load->getExtensionType();
-  EVT VT = Op.getValueType();
-  EVT MemVT = Load->getMemoryVT();
-
-  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
-    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
-    // FIXME: Copied from PPC
-    // First, load into 32 bits, then truncate to 1 bit.
-
-    SDValue Chain = Load->getChain();
-    SDValue BasePtr = Load->getBasePtr();
-    MachineMemOperand *MMO = Load->getMemOperand();
-
-    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
-                                   BasePtr, MVT::i8, MMO);
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
-      NewLD.getValue(1)
-    };
-
-    return DAG.getMergeValues(Ops, DL);
-  }
-
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-      Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
-      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
-    return SDValue();
-
-  // Get the register holding the target.
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, DL, MVT::i32));
-  // Load the Register.
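  // Worked example (illustrative): an 8-bit extload from private address 7
  // reads the 32-bit register at index 7 >> 2 = 1, selects byte 7 & 3 = 3 by
  // shifting right 3 * 8 = 24 bits, and then sign- or zero-extends the byte,
  // which is exactly the node sequence built below.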
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - - // Get offset within the register. - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); - - // Bit offset of target byte (byteIdx * 8). - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); - - // Eliminate the upper bits by setting them to ... - EVT MemEltVT = MemVT.getScalarType(); - - // ... ones. - if (ExtType == ISD::SEXTLOAD) { - SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); - } - - // ... or zeros. - SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); - if (Result.getNode()) { - return Result; - } - - StoreSDNode *Store = cast(Op); - SDValue Chain = Store->getChain(); - if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && - Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); - } - - EVT MemVT = Store->getMemoryVT(); - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && - MemVT.bitsLT(MVT::i32)) { - unsigned Mask = 0; - if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; - } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; - } - SDValue BasePtr = Store->getBasePtr(); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, - DAG.getConstant(0x3, DL, MVT::i32)); - - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, - Store->getValue()); - - SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); - - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, - MaskedValue, ShiftAmt); - - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(Mask, DL, MVT::i32), - ShiftAmt); - DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, DL, MVT::i32)); - Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); - - SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - } - return SDValue(); -} - -// This is a shortcut for integer division because we have fast i32<->f32 -// conversions, and fast f32 reciprocal instructions. The fractional part of a -// float is enough to accurately represent up to a 24-bit integer. 
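// Illustrative walk-through for the unsigned case: for 100 / 7, fa = 100.0f,
// fb = 7.0f, fq = trunc(fa * rcp(fb)) = 14.0f, so iq = 14 and the recomputed
// remainder fr = |fa - fq * fb| = 2.0f. Since fr < fb, the select drops the
// +1 correction (jq becomes 0) and the result is 100 / 7 = 14 with remainder
// 100 - 14 * 7 = 2. If rcp() had rounded the quotient down by one, fr would
// be >= fb and jq = 1 would bump the quotient back up.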
-SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - MVT IntVT = MVT::i32; - MVT FltVT = MVT::f32; - - ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; - ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; - - if (VT.isVector()) { - unsigned NElts = VT.getVectorNumElements(); - IntVT = MVT::getVectorVT(MVT::i32, NElts); - FltVT = MVT::getVectorVT(MVT::f32, NElts); - } - - unsigned BitSize = VT.getScalarType().getSizeInBits(); - - SDValue jq = DAG.getConstant(1, DL, IntVT); - - if (sign) { - // char|short jq = ia ^ ib; - jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, VT, jq, - DAG.getConstant(BitSize - 2, DL, VT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, IntVT); - } - - // int ia = (int)LHS; - SDValue ia = sign ? - DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); - - // int ib, (int)RHS; - SDValue ib = sign ? - DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); - - // float fa = (float)ia; - SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); - - // float fb = (float)ib; - SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); - - // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, - fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); - - // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); - - // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); - - // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, - DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); - - // int iq = (int)fq; - SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); - - // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); - - // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); - - // int cv = fr >= fb; - SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); - - // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); - - // dst = trunc/extend to legal type - iq = sign ? 
DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); - - // dst = iq + jq; - SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); - - // Rem needs compensation, it's easier to recompute it - SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); - Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); - - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); -} - -void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, - SelectionDAG &DAG, - SmallVectorImpl &Results) const { - assert(Op.getValueType() == MVT::i64); - - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, DL, HalfVT); - SDValue zero = DAG.getConstant(0, DL, HalfVT); - - //HiLo split - SDValue LHS = Op.getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = Op.getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - if (VT == MVT::i64 && - DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && - DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { - - SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), - LHS_Lo, RHS_Lo); - - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); - Results.push_back(DIV); - Results.push_back(REM); - return; - } - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - const unsigned bitPos = halfBitWidth - i - 1; - SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); - // Get value of high bit - SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); - - // Shift - REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); - // Add LHS high bit - REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); - - SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); - } - - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); -} - -SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - if (VT == MVT::i64) { - SmallVector Results; - LowerUDIVREM64(Op, DAG, Results); - return DAG.getMergeValues(Results, DL); - } - - SDValue Num = Op.getOperand(0); - SDValue Den = Op.getOperand(1); - - if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && - DAG.MaskedValueIsZero(Den, 
APInt::getHighBitsSet(32, 8))) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? - return LowerDIVREM24(Op, DAG, false); - } - } - - // RCP = URECIP(Den) = 2^32 / Den + e - // e is rounding error. - SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); - - // RCP_LO = mul(RCP, Den) */ - SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); - - // RCP_HI = mulhu (RCP, Den) */ - SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); - - // NEG_RCP_LO = -RCP_LO - SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - RCP_LO); - - // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) - SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), - NEG_RCP_LO, RCP_LO, - ISD::SETEQ); - // Calculate the rounding error from the URECIP instruction - // E = mulhu(ABS_RCP_LO, RCP) - SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); - - // RCP_A_E = RCP + E - SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); - - // RCP_S_E = RCP - E - SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); - - // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) - SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), - RCP_A_E, RCP_S_E, - ISD::SETEQ); - // Quotient = mulhu(Tmp0, Num) - SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); - - // Num_S_Remainder = Quotient * Den - SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); - - // Remainder = Num - Num_S_Remainder - SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); - - // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) - SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, - DAG.getConstant(-1, DL, VT), - DAG.getConstant(0, DL, VT), - ISD::SETUGE); - // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) - SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, - Num_S_Remainder, - DAG.getConstant(-1, DL, VT), - DAG.getConstant(0, DL, VT), - ISD::SETUGE); - // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero - SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, - Remainder_GE_Zero); - - // Calculate Division result: - - // Quotient_A_One = Quotient + 1 - SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Quotient_S_One = Quotient - 1 - SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) - SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), - Quotient, Quotient_A_One, ISD::SETEQ); - - // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) - Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Quotient_S_One, Div, ISD::SETEQ); - - // Calculate Rem result: - - // Remainder_S_Den = Remainder - Den - SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); - - // Remainder_A_Den = Remainder + Den - SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); - - // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) - SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), - Remainder, Remainder_S_Den, ISD::SETEQ); - - // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) - Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue NegOne = DAG.getConstant(-1, DL, VT); - - if (VT == MVT::i32 && - DAG.ComputeNumSignBits(LHS) > 8 && - DAG.ComputeNumSignBits(RHS) > 8) { - return LowerDIVREM24(Op, DAG, true); - } - if (VT == MVT::i64 && - DAG.ComputeNumSignBits(LHS) > 32 && - DAG.ComputeNumSignBits(RHS) > 32) { - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - //HiLo split - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); - SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), - LHS_Lo, RHS_Lo); - SDValue Res[2] = { - DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), - DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) - }; - return DAG.getMergeValues(Res, DL); - } - - SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); - SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); - SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); - SDValue RSign = LHSign; // Remainder sign is the same as LHS - - LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); - RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); - - LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); - RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); - - SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); - SDValue Rem = Div.getValue(1); - - Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); - Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); - - Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); - Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); - - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); -} - -// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) -SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - EVT VT = Op.getValueType(); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); - SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); - - return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); -} - -SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - // result = trunc(src) - // if (src > 0.0 && src != result) - // result += 1.0 - - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - - SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); - SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); - SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); - - SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); -} - -static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { - const 
unsigned FractBits = 52; - const unsigned ExpBits = 11; - - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - Hi, - DAG.getConstant(FractBits - 32, SL, MVT::i32), - DAG.getConstant(ExpBits, SL, MVT::i32)); - SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, - DAG.getConstant(1023, SL, MVT::i32)); - - return Exp; -} - -SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - assert(Op.getValueType() == MVT::f64); - - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - - SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); - - // Extract the upper half, since this is where we will find the sign and - // exponent. - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); - - SDValue Exp = extractF64Exponent(Hi, SL, DAG); - - const unsigned FractBits = 52; - - // Extract the sign bit. - const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); - SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); - - // Extend back to to 64-bits. - SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, - Zero, SignBit); - SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); - - SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); - const SDValue FractMask - = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); - - SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); - SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); - SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - - const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); - - SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); - SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); - - SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); - SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); - - return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); -} - -SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - assert(Op.getValueType() == MVT::f64); - - APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); - SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); - SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); - - SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); - SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); - - SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); - - APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); - SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); - - return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); -} - -SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { - // FNEARBYINT and FRINT are the same, except in their handling of FP - // exceptions. Those aren't really meaningful for us, and OpenCL only has - // rint, so just treat them as equivalent. - return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); -} - -// XXX - May require not supporting f32 denormals? 
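// Illustrative example for the f32 lowering below: round(-2.7f) computes
// T = trunc(-2.7) = -2.0 and |X - T| = 0.7 >= 0.5, so the result is
// T + copysign(1.0, X) = -3.0; for -2.3f the difference 0.3 is below 0.5 and
// the result stays at T = -2.0.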
-SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); - - SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); - - SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); - - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); - - SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); - - SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); - - return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); -} - -SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); - - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); - const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); - - SDValue Exp = extractF64Exponent(Hi, SL, DAG); - - const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, - MVT::i64); - - SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); - SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, - DAG.getConstant(INT64_C(0x0008000000000000), SL, - MVT::i64), - Exp); - - SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); - SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, - DAG.getConstant(0, SL, MVT::i64), Tmp0, - ISD::SETNE); - - SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, - D, DAG.getConstant(0, SL, MVT::i64)); - SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); - - K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); - K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); - - SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); - SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); - SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); - - SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, - ExpEqNegOne, - DAG.getConstantFP(1.0, SL, MVT::f64), - DAG.getConstantFP(0.0, SL, MVT::f64)); - - SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); - - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); - - return K; -} - -SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (VT == MVT::f32) - return LowerFROUND32(Op, DAG); - - if (VT == MVT::f64) - return LowerFROUND64(Op, DAG); - - llvm_unreachable("unhandled type"); -} - -SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - // result = trunc(src); - // if (src < 0.0 && src != result) - // result += -1.0. 
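  // Illustrative: floor(-2.3) = trunc(-2.3) + (-1.0) = -3.0, while floor(2.3)
  // and floor(-2.0) keep the truncated value unchanged.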
- - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); - const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - - SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); - SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); - SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); - - SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); -} - -SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, - bool Signed) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); - - SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(0, SL, MVT::i32)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(1, SL, MVT::i32)); - - SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, - SL, MVT::f64, Hi); - - SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); - - SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, - DAG.getConstant(32, SL, MVT::i32)); - - return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); -} - -SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); - - EVT DestVT = Op.getValueType(); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, false); - - assert(DestVT == MVT::f32); - - SDLoc DL(Op); - - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); -} - -SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, - bool Signed) const { - SDLoc SL(Op); - - SDValue Src = Op.getOperand(0); - - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, - MVT::f64); - SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, - MVT::f64); - - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); - - SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); - - - SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); - - SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, - MVT::i32, FloorMul); - SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); - - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); - - return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, false); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, - SelectionDAG &DAG) const { - EVT ExtraVT = cast(Op.getOperand(1))->getVT(); - MVT VT = Op.getSimpleValueType(); - MVT ScalarVT = VT.getScalarType(); - - if (!VT.isVector()) - return SDValue(); - - SDValue Src = Op.getOperand(0); - SDLoc DL(Op); - - // TODO: Don't scalarize on Evergreen? - unsigned NElts = VT.getVectorNumElements(); - SmallVector Args; - DAG.ExtractVectorElements(Src, Args, 0, NElts); - - SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); - for (unsigned I = 0; I < NElts; ++I) - Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); -} - -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// - -static bool isU24(SDValue Op, SelectionDAG &DAG) { - APInt KnownZero, KnownOne; - EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, KnownZero, KnownOne); - - return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; -} - -static bool isI24(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. - return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated - // as unsigned 24-bit values. - (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; -} - -static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { - - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT VT = Op.getValueType(); - - APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); -} - -template -static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, - uint32_t Offset, uint32_t Width, SDLoc DL) { - if (Width + Offset < 32) { - uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); - IntTy Result = static_cast(Shl) >> (32 - Width); - return DAG.getConstant(Result, DL, MVT::i32); - } - - return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); -} - -static bool usesAllNormalStores(SDNode *LoadVal) { - for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { - if (!ISD::isNormalStore(*I)) - return false; - } - - return true; -} - -// If we have a copy of an illegal type, replace it with a load / store of an -// equivalently sized legal type. This avoids intermediate bit pack / unpack -// instructions emitted when handling extloads and truncstores. 
Ideally we could -// recognize the pack / unpack pattern to eliminate it. -SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (!DCI.isBeforeLegalize()) - return SDValue(); - - StoreSDNode *SN = cast(N); - SDValue Value = SN->getValue(); - EVT VT = Value.getValueType(); - - if (isTypeLegal(VT) || SN->isVolatile() || - !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) - return SDValue(); - - LoadSDNode *LoadVal = cast(Value); - if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) - return SDValue(); - - EVT MemVT = LoadVal->getMemoryVT(); - - SDLoc SL(N); - SelectionDAG &DAG = DCI.DAG; - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); - - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - LoadVT, SL, - LoadVal->getChain(), - LoadVal->getBasePtr(), - LoadVal->getOffset(), - LoadVT, - LoadVal->getMemOperand()); - - SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); - DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); - - return DAG.getStore(SN->getChain(), SL, NewLoad, - SN->getBasePtr(), SN->getMemOperand()); -} - -SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - - if (VT.isVector() || VT.getSizeInBits() > 32) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue Mul; - - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); - } else { - return SDValue(); - } - - // We need to use sext even for MUL_U24, because MUL_U24 is used - // for signed multiply of 8 and 16-bit types. - return DAG.getSExtOrTrunc(Mul, DL, VT); -} - -SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - switch(N->getOpcode()) { - default: break; - case ISD::MUL: - return performMulCombine(N, DCI); - case AMDGPUISD::MUL_I24: - case AMDGPUISD::MUL_U24: { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - simplifyI24(N0, DCI); - simplifyI24(N1, DCI); - return SDValue(); - } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
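      // Illustrative: on SI and newer, (select (setlt x, y), x, y) on i32 is
      // folded by CombineIMinMax above into smin (and the unsigned and
      // inverted predicates into umin/smax/umax); Evergreen-class targets
      // skip this until the corresponding instructions are implemented.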
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } - } - - break; - } - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - assert(!N->getValueType(0).isVector() && - "Vector handling of BFE not implemented"); - ConstantSDNode *Width = dyn_cast(N->getOperand(2)); - if (!Width) - break; - - uint32_t WidthVal = Width->getZExtValue() & 0x1f; - if (WidthVal == 0) - return DAG.getConstant(0, DL, MVT::i32); - - ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); - if (!Offset) - break; - - SDValue BitsFrom = N->getOperand(0); - uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; - - bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; - - if (OffsetVal == 0) { - // This is already sign / zero extended, so try to fold away extra BFEs. - unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); - - unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); - if (OpSignBits >= SignBits) - return BitsFrom; - - EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); - if (Signed) { - // This is a sign_extend_inreg. Replace it to take advantage of existing - // DAG Combines. If not eliminated, we will match back to BFE during - // selection. - - // TODO: The sext_inreg of extended types ends, although we can could - // handle them in a single BFE. - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, - DAG.getValueType(SmallVT)); - } - - return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); - } - - if (ConstantSDNode *CVal = dyn_cast(BitsFrom)) { - if (Signed) { - return constantFoldBFE(DAG, - CVal->getSExtValue(), - OffsetVal, - WidthVal, - DL); - } - - return constantFoldBFE(DAG, - CVal->getZExtValue(), - OffsetVal, - WidthVal, - DL); - } - - if ((OffsetVal + WidthVal) >= 32) { - SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); - return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, - BitsFrom, ShiftVal); - } - - if (BitsFrom.hasOneUse()) { - APInt Demanded = APInt::getBitsSet(32, - OffsetVal, - OffsetVal + WidthVal); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, - KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - } - - break; - } - - case ISD::STORE: - return performStoreCombine(N, DCI); - } - return SDValue(); -} - -//===----------------------------------------------------------------------===// -// Helper functions -//===----------------------------------------------------------------------===// - -void AMDGPUTargetLowering::getOriginalFunctionArgs( - SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl &Ins, - SmallVectorImpl &OrigIns) const { - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - if (Ins[i].ArgVT == Ins[i].VT) { - OrigIns.push_back(Ins[i]); - continue; - } - - EVT VT; - if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { - // Vector has been split into scalars. - VT = Ins[i].ArgVT.getVectorElementType(); - } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && - Ins[i].ArgVT.getVectorElementType() != - Ins[i].VT.getVectorElementType()) { - // Vector elements have been promoted - VT = Ins[i].ArgVT; - } else { - // Vector has been spilt into smaller vectors. 
- VT = Ins[i].VT; - } - - ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, - Ins[i].OrigArgIndex, Ins[i].PartOffset); - OrigIns.push_back(Arg); - } -} - -bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->isExactlyValue(1.0); - } - if (ConstantSDNode *C = dyn_cast(Op)) { - return C->isAllOnesValue(); - } - return false; -} - -bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->getValueAPF().isZero(); - } - if (ConstantSDNode *C = dyn_cast(Op)) { - return C->isNullValue(); - } - return false; -} - -SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; - if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); - } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); - } - return DAG.getRegister(VirtualRegister, VT); -} - -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(CALL); - NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(RET_FLAG); - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(DWORDADDR) - NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(TRIG_PREOP) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RSQ_LEGACY) - NODE_NAME_CASE(RSQ_CLAMPED) - NODE_NAME_CASE(LDEXP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_CONSTANT) - NODE_NAME_CASE(LOAD_INPUT) - NODE_NAME_CASE(SAMPLE) - NODE_NAME_CASE(SAMPLEB) - NODE_NAME_CASE(SAMPLED) - NODE_NAME_CASE(SAMPLEL) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(SENDMSG) - NODE_NAME_CASE(INTERP_MOV) - NODE_NAME_CASE(INTERP_P1) - NODE_NAME_CASE(INTERP_P2) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; - } - return nullptr; -} - -SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = Operand.getValueType(); - - if (VT == MVT::f32) { - RefinementSteps = 0; - return 
DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); - } - - // TODO: There is also f64 rsq instruction, but the documentation is less - // clear on its precision. - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = Operand.getValueType(); - - if (VT == MVT::f32) { - // Reciprocal, < 1 ulp error. - // - // This reciprocal approximation converges to < 0.5 ulp error with one - // newton rhapson performed with two fused multiple adds (FMAs). - - RefinementSteps = 0; - return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); - } - - // TODO: There is also f64 rcp instruction, but the documentation is less - // clear on its precision. - - return SDValue(); -} - -static void computeKnownBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { - APInt Op0Zero, Op0One; - APInt Op1Zero, Op1One; - DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); - DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); - - KnownZero = Op0Zero & Op1Zero; - KnownOne = Op0One & Op1One; -} - -void AMDGPUTargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { - - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. - - APInt KnownZero2; - APInt KnownOne2; - unsigned Opc = Op.getOpcode(); - - switch (Opc) { - default: - break; - case ISD::INTRINSIC_WO_CHAIN: { - // FIXME: The intrinsic should just use the node. - switch (cast(Op.getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::AMDGPU_imax: - case AMDGPUIntrinsic::AMDGPU_umax: - case AMDGPUIntrinsic::AMDGPU_imin: - case AMDGPUIntrinsic::AMDGPU_umin: - computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); - break; - default: - break; - } - - break; - } - case AMDGPUISD::CARRY: - case AMDGPUISD::BORROW: { - KnownZero = APInt::getHighBitsSet(32, 31); - break; - } - - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - ConstantSDNode *CWidth = dyn_cast(Op.getOperand(2)); - if (!CWidth) - return; - - unsigned BitWidth = 32; - uint32_t Width = CWidth->getZExtValue() & 0x1f; - - if (Opc == AMDGPUISD::BFE_U32) - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); - - break; - } - } -} - -unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth) const { - switch (Op.getOpcode()) { - case AMDGPUISD::BFE_I32: { - ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); - if (!Width) - return 1; - - unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) - return SignBits; - - // TODO: Could probably figure something out with non-0 offsets. - unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - return std::max(SignBits, Op0SignBits); - } - - case AMDGPUISD::BFE_U32: { - ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); - return Width ? 
32 - (Width->getZExtValue() & 0x1f) : 1; - } - - case AMDGPUISD::CARRY: - case AMDGPUISD::BORROW: - return 31; - - default: - return 1; - } -} diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h deleted file mode 100644 index fbb7d3c8843..00000000000 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ /dev/null @@ -1,307 +0,0 @@ -//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition of the TargetLowering class that is common -/// to all AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H - -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -class AMDGPUMachineFunction; -class AMDGPUSubtarget; -class MachineRegisterInfo; - -class AMDGPUTargetLowering : public TargetLowering { -protected: - const AMDGPUSubtarget *Subtarget; - -private: - SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - /// \brief Lower vector stores by merging the vector elements into an integer - /// of the same bitwidth. - SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into multiple scalar stores. - /// \returns The resulting chain. - - SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - - SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; - -protected: - static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); - static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); - - virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const; - - /// \brief Split a vector load into a scalar load of each component. 
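The BFE_I32 / BFE_U32 handling above (constantFoldBFE, the PerformDAGCombine cases, and the known-bits / sign-bits hooks) all assume the same scalar semantics for the bitfield-extract nodes. Purely as an illustration, here is a minimal C++ model of those semantics; the bfe_u32 / bfe_i32 helper names are invented, the width is assumed to be pre-masked to 0-31 as in the code above, and arithmetic right shift of negative values is assumed:

    #include <cstdint>

    // BFE_U32: extract `width` bits starting at `offset`, zero-extended to 32 bits.
    static uint32_t bfe_u32(uint32_t src, uint32_t offset, uint32_t width) {
      if (width == 0)
        return 0;                               // mirrors the WidthVal == 0 fold
      if (offset + width < 32)
        return (src << (32 - offset - width)) >> (32 - width);
      return src >> offset;                     // field reaches bit 31: plain shift
    }

    // BFE_I32: same field, sign-extended, so the result has at least
    // 32 - width + 1 sign bits (width = 8 gives >= 25), which is what
    // ComputeNumSignBitsForTargetNode reports for a constant width.
    static int32_t bfe_i32(int32_t src, uint32_t offset, uint32_t width) {
      if (width == 0)
        return 0;
      if (offset + width < 32) {
        int32_t shl = (int32_t)((uint32_t)src << (32 - offset - width));
        return shl >> (32 - width);             // arithmetic shift replicates the sign
      }
      return src >> offset;
    }

With offset == 0 the extract degenerates into a plain zero/sign extension of the low `width` bits, which is why the combine above can replace it with SIGN_EXTEND_INREG or getZeroExtendInReg.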
- SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; - - /// \brief Split a vector load into 2 loads of half the vector. - SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; - - /// \brief Split a vector store into a scalar store of each component. - SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; - - /// \brief Split a vector store into 2 stores of half the vector. - SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; - void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl &Results) const; - bool isHWTrueValue(SDValue Op) const; - bool isHWFalseValue(SDValue Op) const; - - /// The SelectionDAGBuilder will automatically promote function arguments - /// with illegal types. However, this does not work for the AMDGPU targets - /// since the function arguments are stored in memory as these illegal types. - /// In order to handle this properly we need to get the origianl types sizes - /// from the LLVM IR Function and fixup the ISD:InputArg values before - /// passing them to AnalyzeFormalArguments() - void getOriginalFunctionArgs(SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl &Ins, - SmallVectorImpl &OrigIns) const; - void AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl &Ins) const; - -public: - AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); - - bool isFAbsFree(EVT VT) const override; - bool isFNegFree(EVT VT) const override; - bool isTruncateFree(EVT Src, EVT Dest) const override; - bool isTruncateFree(Type *Src, Type *Dest) const override; - - bool isZExtFree(Type *Src, Type *Dest) const override; - bool isZExtFree(EVT Src, EVT Dest) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - - MVT getVectorIdxTy() const override; - bool isSelectSupported(SelectSupportKind) const override; - - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - bool ShouldShrinkFPConstant(EVT VT) const override; - bool shouldReduceLoadWidth(SDNode *Load, - ISD::LoadExtType ExtType, - EVT ExtVT) const override; - - bool isLoadBitCastBeneficial(EVT, EVT) const override; - - bool storeOfVectorConstantIsCheap(EVT MemVT, - unsigned NumElem, - unsigned AS) const override; - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const override; - SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const override; - - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - void ReplaceNodeResults(SDNode * N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const override; - - SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const; - SDValue 
CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; - - const char* getTargetNodeName(unsigned Opcode) const override; - - SDValue getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const override; - SDValue getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; - - virtual SDNode *PostISelFolding(MachineSDNode *N, - SelectionDAG &DAG) const { - return N; - } - - /// \brief Determine which of the bits specified in \p Mask are known to be - /// either zero or one and return them in the \p KnownZero and \p KnownOne - /// bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's - /// MachineFunction. - /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; -}; - -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - CALL, // Function call based on a single integer - UMUL, // 32bit unsigned multiplication - RET_FLAG, - BRANCH_COND, - // End AMDIL ISD Opcodes - DWORDADDR, - FRACT, - CLAMP, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - TRIG_PREOP, // 1 ULP max error for f64 - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RSQ_LEGACY, - RSQ_CLAMPED, - LDEXP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. - MUL_U24, - MUL_I24, - MAD_U24, - MAD_I24, - TEXTURE_FETCH, - EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - LOAD_INPUT, - SAMPLE, - SAMPLEB, - SAMPLED, - SAMPLEL, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. - /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. 
- CONST_DATA_PTR, - SENDMSG, - INTERP_MOV, - INTERP_P1, - INTERP_P2, - FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, - STORE_MSKOR, - LOAD_CONSTANT, - TBUFFER_STORE_FORMAT, - LAST_AMDGPU_ISD_NUMBER -}; - - -} // End namespace AMDGPUISD - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp deleted file mode 100644 index 15a3d543a68..00000000000 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ /dev/null @@ -1,369 +0,0 @@ -//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Implementation of the TargetInstrInfo class that is common to all -/// AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUInstrInfo.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#define GET_INSTRINFO_NAMED_OPS -#define GET_INSTRMAP_INFO -#include "AMDGPUGenInstrInfo.inc" - -// Pin the vtable to this file. -void AMDGPUInstrInfo::anchor() {} - -AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUGenInstrInfo(-1, -1), ST(st) {} - -const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { - return RI; -} - -bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SubIdx) const { -// TODO: Implement this function - return false; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} - -MachineInstr * -AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { -// TODO: Implement this function - return nullptr; -} - -void -AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("Not Implemented"); -} - -void -AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - 
llvm_unreachable("Not Implemented"); -} - -bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { - MachineBasicBlock *MBB = MI->getParent(); - int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::addr); - // addr is a custom operand with multiple MI operands, and only the - // first MI operand is given a name. - int RegOpIdx = OffsetOpIdx + 1; - int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::chan); - if (isRegisterLoad(*MI)) { - int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::dst); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - getIndirectAddrRegClass()->getRegister(Address)); - } else { - buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - Address, OffsetReg); - } - } else if (isRegisterStore(*MI)) { - int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::val); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), - MI->getOperand(ValOpIdx).getReg()); - } else { - buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), - calculateIndirectAddress(RegIndex, Channel), - OffsetReg); - } - } else { - return false; - } - - MBB->erase(MI); - return true; -} - -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { -// TODO: Implement this function - return nullptr; -} -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { - // TODO: Implement this function - return nullptr; -} -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef Ops) const { - // TODO: Implement this function - return false; -} -bool -AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, - bool UnfoldStore, - SmallVectorImpl &NewMIs) const { - // TODO: Implement this function - return false; -} - -bool -AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl &NewNodes) const { - // TODO: Implement this function - return false; -} - -unsigned -AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex) const { - // TODO: Implement this function - return 0; -} - -bool AMDGPUInstrInfo::enableClusterLoads() const { - return true; -} - -// FIXME: This behaves strangely. If, for example, you have 32 load + stores, -// the first 16 loads will be interleaved with the stores, and the next 16 will -// be clustered as expected. It should really split into 2 16 store batches. -// -// Loads are clustered until this returns false, rather than trying to schedule -// groups of stores. 
This also means we have to deal with saying different -// address space loads should be clustered, and ones which might cause bank -// conflicts. -// -// This might be deprecated so it might not be worth that much effort to fix. -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, - int64_t Offset0, int64_t Offset1, - unsigned NumLoads) const { - assert(Offset1 > Offset0 && - "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 64 - // bytes, then schedule together. - - // A cacheline is 64 bytes (for global memory). - return (NumLoads <= 16 && (Offset1 - Offset0) < 64); -} - -bool -AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) - const { - // TODO: Implement this function - return true; -} -void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - // TODO: Implement this function -} - -bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { - // TODO: Implement this function - return MI->getDesc().isPredicable(); -} - -bool -AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - // TODO: Implement this function - return true; -} - -bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; -} - -bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; -} - -int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int Offset = -1; - - if (MFI->getNumObjects() == 0) { - return -1; - } - - if (MRI.livein_empty()) { - return 0; - } - - const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); - for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), - LE = MRI.livein_end(); - LI != LE; ++LI) { - unsigned Reg = LI->first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) - continue; - - unsigned RegIndex; - unsigned RegEnd; - for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; - ++RegIndex) { - if (IndirectRC->getRegister(RegIndex) == Reg) - break; - } - Offset = std::max(Offset, (int)RegIndex); - } - - return Offset + 1; -} - -int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { - int Offset = 0; - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Variable sized objects are not supported - assert(!MFI->hasVarSizedObjects()); - - if (MFI->getNumObjects() == 0) { - return -1; - } - - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); - - return getIndirectIndexBegin(MF) + Offset; -} - -int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { - switch (Channels) { - default: return Opcode; - case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); - case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); - case 3: return AMDGPU::getMaskedMIMGOp(Opcode, 
AMDGPU::Channels_3); - } -} - -// Wrapper for Tablegen'd function. enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned -// instead. -namespace llvm { -namespace AMDGPU { -static int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); -} -} -} - -// This must be kept in sync with the SISubtarget class in SIInstrInfo.td -enum SISubtarget { - SI = 0, - VI = 1 -}; - -static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { - switch (Gen) { - default: - return SI; - case AMDGPUSubtarget::VOLCANIC_ISLANDS: - return VI; - } -} - -int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode( - Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); - - // -1 means that Opcode is already a native instruction. - if (MCOp == -1) - return Opcode; - - // (uint16_t)-1 means that Opcode is a pseudo instruction that has - // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) - return -1; - - return MCOp; -} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h deleted file mode 100644 index 86d3962b385..00000000000 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ /dev/null @@ -1,206 +0,0 @@ -//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Contains the definition of a TargetInstrInfo class that is common -/// to all AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H - -#include "AMDGPURegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include - -#define GET_INSTRINFO_HEADER -#define GET_INSTRINFO_ENUM -#define GET_INSTRINFO_OPERAND_ENUM -#include "AMDGPUGenInstrInfo.inc" - -#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT -#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT -#define OPCODE_IS_ZERO AMDGPU::PRED_SETE -#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE - -namespace llvm { - -class AMDGPUSubtarget; -class MachineFunction; -class MachineInstr; -class MachineInstrBuilder; - -class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { -private: - const AMDGPURegisterInfo RI; - virtual void anchor(); -protected: - const AMDGPUSubtarget &ST; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - - virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const override; - bool hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const override; - unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; - bool hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const; - - MachineInstr * - convertToThreeAddress(MachineFunction::iterator &MFI, - 
MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const override; - - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - -protected: - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override; - -public: - /// \returns the smallest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexBegin(const MachineFunction &MF) const; - - /// \returns the largest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexEnd(const MachineFunction &MF) const; - - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef Ops) const override; - bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl &NewMIs) const override; - bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl &NewNodes) const override; - unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = nullptr) const override; - - bool enableClusterLoads() const override; - - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const override; - - bool - ReverseBranchCondition(SmallVectorImpl &Cond) const override; - void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const override; - bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const override; - bool isPredicable(MachineInstr *MI) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - // Helper functions that check the opcode for status information - bool isRegisterStore(const MachineInstr &MI) const; - bool isRegisterLoad(const MachineInstr &MI) const; - - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. - /// Return -1 if the target-specific opcode for the pseudo instruction does - /// not exist. If Opcode is not a pseudo instruction, this is identity. - int pseudoToMCOpcode(int Opcode) const; - - /// \brief Return the descriptor of the target-specific machine instruction - /// that corresponds to the specified pseudo or native opcode. - const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { - return get(pseudoToMCOpcode(Opcode)); - } - -//===---------------------------------------------------------------------===// -// Pure virtual funtions to be implemented by sub-classes. 
-//===---------------------------------------------------------------------===// - - virtual bool isMov(unsigned opcode) const = 0; - - /// \brief Calculate the "Indirect Address" for the given \p RegIndex and - /// \p Channel - /// - /// We model indirect addressing using a virtual address space that can be - /// accesed with loads and stores. The "Indirect Address" is the memory - /// address in this virtual address space that maps to the given \p RegIndex - /// and \p Channel. - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const = 0; - - /// \returns The register class to be used for loading and storing values - /// from an "Indirect Address" . - virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; - - /// \brief Build instruction(s) for an indirect register write. - /// - /// \returns The instruction that performs the indirect register write - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build instruction(s) for an indirect register read. - /// - /// \returns The instruction that performs the indirect register read - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build a MOV instruction. - virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const = 0; - - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the - /// equivalent opcode that writes \p Channels Channels. - int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; - -}; - -namespace AMDGPU { - int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); -} // End namespace AMDGPU - -} // End llvm namespace - -#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) -#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) - -#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td deleted file mode 100644 index b413897d9d2..00000000000 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ /dev/null @@ -1,245 +0,0 @@ -//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains DAG node defintions for the AMDGPU target. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// AMDGPU DAG Profiles -//===----------------------------------------------------------------------===// - -def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> -]>; - -def AMDGPUTrigPreOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - -def AMDGPULdExpOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - -def AMDGPUFPClassOp : SDTypeProfile<1, 2, - [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] ->; - -def AMDGPUDivScaleOp : SDTypeProfile<2, 3, - [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] ->; - -// float, float, float, vcc -def AMDGPUFmasOp : SDTypeProfile<1, 4, - [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] ->; - -//===----------------------------------------------------------------------===// -// AMDGPU DAG Nodes -// - -// This argument to this node is a dword address. -def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; - -def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; -def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; - -// out = a - floor(a) -def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; - -// out = 1.0 / a -def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) -def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) -def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) result clamped to +/- max_float. -def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; - -def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; - -def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; - -// out = max(a, b) a and b are floats, where a nan comparison fails. -// This is not commutative because this gives the second operand: -// x < nan ? x : nan -> nan -// nan < x ? nan : x -> x -def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, - [] ->; - -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; - -// out = max(a, b) a and b are signed ints -def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = max(a, b) a and b are unsigned ints -def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = min(a, b) a and b are floats, where a nan comparison fails. -def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, - [] ->; - -// FIXME: TableGen doesn't like commutative instructions with more -// than 2 operands. 
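As the FMAX_LEGACY / FMIN_LEGACY comments above spell out, the legacy min/max nodes behave like a compare-and-select in which a NaN operand makes the comparison fail, so the second operand is returned and the nodes cannot be marked commutative. A deliberately loose sketch of that behaviour (not the precise hardware definition):

    // Any NaN operand makes the compare false, so `b` is returned.
    static float fmax_legacy(float a, float b) {
      return (a > b) ? a : b;
    }
    // fmax_legacy(1.0f, NAN) yields NaN, but fmax_legacy(NAN, 1.0f) yields 1.0f,
    // which is the asymmetry described for FMAX_LEGACY above.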
-// out = max(a, b, c) a, b and c are floats -def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b, and c are signed ints -def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b and c are unsigned ints -def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are floats -def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are signed ints -def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b) a and b are unsigned ints -def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 -def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; - -// out = (src1 > src0) ? 1 : 0 -def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; - - -def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", - SDTIntToFPOp, []>; - - -// urecip - This operation is a helper for integer division, it returns the -// result of 1 / a as a fractional unsigned integer. -// out = (2^32 / a) + e -// e is rounding error -def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; - -// Special case divide preop and flags. -def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; - -// Special case divide FMA with scale and flags (src0 = Quotient, -// src1 = Denominator, src2 = Numerator). -def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; - -// Single or double precision division fixup. -// Special case divide fixup and flags(src0 = Quotient, src1 = -// Denominator, src2 = Numerator). -def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; - -// Look Up 2.0 / pi src0 with segment select src1[4:0] -def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; - -def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", - SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, - [SDNPHasChain, SDNPMayLoad]>; - -def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", - SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, - [SDNPHasChain, SDNPMayStore]>; - -// MSKOR instructions are atomic memory instructions used mainly for storing -// 8-bit and 16-bit values. 
The definition is: -// -// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) -// -// src0: vec4(src, 0, 0, mask) -// src1: dst - rat offset (aka pointer) in dwords -def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", - SDTypeProfile<0, 2, []>, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - -def AMDGPUround : SDNode<"ISD::FROUND", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; - -def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; -def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; - -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - -// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when -// performing the mulitply. The result is a 32-bit value. -def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, - [SDNPCommutative] ->; -def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, - [SDNPCommutative] ->; - -def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, - [] ->; -def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, - [] ->; - -def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", - SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -//===----------------------------------------------------------------------===// -// Flow Control Profile Types -//===----------------------------------------------------------------------===// -// Branch instruction where second and third are basic blocks -def SDTIL_BRCond : SDTypeProfile<0, 2, [ - SDTCisVT<0, OtherVT> - ]>; - -//===----------------------------------------------------------------------===// -// Flow Control DAG Nodes -//===----------------------------------------------------------------------===// -def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; - -//===----------------------------------------------------------------------===// -// Call/Return DAG Nodes -//===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td deleted file mode 100644 index 72cab39277c..00000000000 --- a/lib/Target/R600/AMDGPUInstructions.td +++ /dev/null @@ -1,682 +0,0 @@ -//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains instruction defs that are common to all hw codegen -// targets. 
-// -//===----------------------------------------------------------------------===// - -class AMDGPUInst pattern> : Instruction { - field bit isRegisterLoad = 0; - field bit isRegisterStore = 0; - - let Namespace = "AMDGPU"; - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asm; - let Pattern = pattern; - let Itinerary = NullALU; - - let TSFlags{63} = isRegisterLoad; - let TSFlags{62} = isRegisterStore; -} - -class AMDGPUShaderInst pattern> - : AMDGPUInst { - - field bits<32> Inst = 0xffffffff; - -} - -def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; -def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; - -def InstFlag : OperandWithDefaultOps ; -def ADDRIndirect : ComplexPattern; - -let OperandType = "OPERAND_IMMEDIATE" in { - -def u32imm : Operand { - let PrintMethod = "printU32ImmOperand"; -} - -def u16imm : Operand { - let PrintMethod = "printU16ImmOperand"; -} - -def u8imm : Operand { - let PrintMethod = "printU8ImmOperand"; -} - -} // End OperandType = "OPERAND_IMMEDIATE" - -//===--------------------------------------------------------------------===// -// Custom Operands -//===--------------------------------------------------------------------===// -def brtarget : Operand; - -//===----------------------------------------------------------------------===// -// PatLeafs for floating-point comparisons -//===----------------------------------------------------------------------===// - -def COND_OEQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] ->; - -def COND_ONE : PatLeaf < - (cond), - [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] ->; - -def COND_OGT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] ->; - -def COND_OGE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] ->; - -def COND_OLT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] ->; - -def COND_OLE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] ->; - - -def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; -def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; - -//===----------------------------------------------------------------------===// -// PatLeafs for unsigned / unordered comparisons -//===----------------------------------------------------------------------===// - -def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; -def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; -def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; -def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; -def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; -def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; - -// XXX - For some reason R600 version is preferring to use unordered -// for setne? 
-def COND_UNE_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; - -//===----------------------------------------------------------------------===// -// PatLeafs for signed comparisons -//===----------------------------------------------------------------------===// - -def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; -def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; -def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; -def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; - -//===----------------------------------------------------------------------===// -// PatLeafs for integer equality -//===----------------------------------------------------------------------===// - -def COND_EQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] ->; - -def COND_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] ->; - -def COND_NULL : PatLeaf < - (cond), - [{(void)N; return false;}] ->; - -//===----------------------------------------------------------------------===// -// Load/Store Pattern Fragments -//===----------------------------------------------------------------------===// - -class PrivateMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; -}]>; - -class PrivateLoad : PrivateMemOp < - (ops node:$ptr), (op node:$ptr) ->; - -class PrivateStore : PrivateMemOp < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) ->; - -def load_private : PrivateLoad ; - -def truncstorei8_private : PrivateStore ; -def truncstorei16_private : PrivateStore ; -def store_private : PrivateStore ; - -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -// Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -// Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -class AZExtLoadBase : PatFrag<(ops node:$ptr), - (ld_node node:$ptr), [{ - LoadSDNode *L = cast(N); - return L->getExtensionType() == ISD::ZEXTLOAD || - L->getExtensionType() == ISD::EXTLOAD; -}]>; - -def az_extload : AZExtLoadBase ; - -def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def extloadi8_private : PrivateLoad ; -def sextloadi8_private : 
PrivateLoad ; - -def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def extloadi16_private : PrivateLoad ; -def sextloadi16_private : PrivateLoad ; - -def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def az_extloadi32_global : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi32_flat : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi32_constant : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def local_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; -}]>; - -def local_load_aligned8bytes : Aligned8Bytes < - (ops node:$ptr), (local_load node:$ptr) ->; - -def local_store_aligned8bytes : Aligned8Bytes < - (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) ->; - -class local_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - - -def atomic_swap_local : local_binary_atomic_op; -def atomic_load_add_local : local_binary_atomic_op; -def atomic_load_sub_local : local_binary_atomic_op; -def atomic_load_and_local : 
local_binary_atomic_op; -def atomic_load_or_local : local_binary_atomic_op; -def atomic_load_xor_local : local_binary_atomic_op; -def atomic_load_nand_local : local_binary_atomic_op; -def atomic_load_min_local : local_binary_atomic_op; -def atomic_load_max_local : local_binary_atomic_op; -def atomic_load_umin_local : local_binary_atomic_op; -def atomic_load_umax_local : local_binary_atomic_op; - -def mskor_global : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - -multiclass AtomicCmpSwapLocal { - - def _32_local : PatFrag < - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; - }]>; - - def _64_local : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; - }]>; -} - -defm atomic_cmp_swap : AtomicCmpSwapLocal ; - -def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def flat_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def mskor_flat : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; -}]>; - -class global_binary_atomic_op : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] ->; - -def atomic_swap_global : global_binary_atomic_op; -def atomic_add_global : global_binary_atomic_op; -def atomic_and_global : global_binary_atomic_op; -def atomic_max_global : global_binary_atomic_op; -def atomic_min_global : global_binary_atomic_op; -def atomic_or_global : global_binary_atomic_op; -def atomic_sub_global : global_binary_atomic_op; -def atomic_umax_global : global_binary_atomic_op; -def atomic_umin_global : global_binary_atomic_op; -def atomic_xor_global : global_binary_atomic_op; - -//===----------------------------------------------------------------------===// -// Misc Pattern Fragments -//===----------------------------------------------------------------------===// - -class Constants { -int TWO_PI = 0x40c90fdb; -int PI = 0x40490fdb; -int TWO_PI_INV = 0x3e22f983; -int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; -int FP32_ONE = 0x3f800000; -} -def CONST : Constants; - -def FP_ZERO : PatLeaf < - (fpimm), - [{return N->getValueAPF().isZero();}] ->; - -def FP_ONE : PatLeaf < - (fpimm), - [{return N->isExactlyValue(1.0);}] ->; - -def FP_HALF : PatLeaf < - (fpimm), - [{return N->isExactlyValue(0.5);}] ->; - -let isCodeGenOnly = 1, isPseudo = 1 in { - -let usesCustomInserter = 1 in { - -class CLAMP : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] ->; - -class FABS : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FABS $dst, $src0", - [(set f32:$dst, (fabs f32:$src0))] ->; - -class FNEG : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FNEG $dst, $src0", - [(set f32:$dst, (fneg f32:$src0))] ->; - -} // usesCustomInserter = 1 - -multiclass RegisterLoadStore { 
-let UseNamedOperandTable = 1 in { - - def RegisterLoad : AMDGPUShaderInst < - (outs dstClass:$dst), - (ins addrClass:$addr, i32imm:$chan), - "RegisterLoad $dst, $addr", - [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] - > { - let isRegisterLoad = 1; - } - - def RegisterStore : AMDGPUShaderInst < - (outs), - (ins dstClass:$val, addrClass:$addr, i32imm:$chan), - "RegisterStore $val, $addr", - [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] - > { - let isRegisterStore = 1; - } -} -} - -} // End isCodeGenOnly = 1, isPseudo = 1 - -/* Generic helper patterns for intrinsics */ -/* -------------------------------------- */ - -class POW_Common - : Pat < - (fpow f32:$src0, f32:$src1), - (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) ->; - -/* Other helper patterns */ -/* --------------------- */ - -/* Extract element pattern */ -class Extract_Element - : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), - (EXTRACT_SUBREG $src, sub_reg) ->; - -/* Insert element pattern */ -class Insert_Element - : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), - (INSERT_SUBREG $vec, $elem, sub_reg) ->; - -// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer -// can handle COPY instructions. -// bitconvert pattern -class BitConvert : Pat < - (dt (bitconvert (st rc:$src0))), - (dt rc:$src0) ->; - -// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer -// can handle COPY instructions. -class DwordAddrPat : Pat < - (vt (AMDGPUdwordaddr (vt rc:$addr))), - (vt rc:$addr) ->; - -// BFI_INT patterns - -multiclass BFIPatterns { - // Definition from ISA doc: - // (y & x) | (z & ~x) - def : Pat < - (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (BFI_INT $x, $y, $z) - >; - - // SHA-256 Ch function - // z ^ (x & (y ^ z)) - def : Pat < - (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (BFI_INT $x, $y, $z) - >; - - def : Pat < - (fcopysign f32:$src0, f32:$src1), - (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) - >; - - def : Pat < - (f64 (fcopysign f64:$src0, f64:$src1)), - (REG_SEQUENCE RC64, - (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), - (i32 (EXTRACT_SUBREG $src0, sub1)), - (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) - >; -} - -// SHA-256 Ma patterns - -// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y -class SHA256MaPattern : Pat < - (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) ->; - -// Bitfield extract patterns - -def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ - return isMask_32(N->getZExtValue()); -}]>; - -def IMMPopCount : SDNodeXFormgetTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), - MVT::i32); -}]>; - -class BFEPattern : Pat < - (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), - (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) ->; - -// rotr pattern -class ROTRPattern : Pat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - -// 24-bit arithmetic patterns -def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; - -// Special conversion patterns - -def cvt_rpi_i32_f32 : PatFrag < - (ops node:$src), - (fp_to_sint (ffloor (fadd $src, FP_HALF))), - [{ (void) N; return TM.Options.NoNaNsFPMath; }] ->; - -def cvt_flr_i32_f32 : PatFrag < - (ops node:$src), - (fp_to_sint (ffloor $src)), - [{ (void)N; return TM.Options.NoNaNsFPMath; }] ->; - -/* -class UMUL24Pattern : Pat < - (mul U24:$x, U24:$y), - (UMUL24 $x, $y) 
->; -*/ - -class IMad24Pat : Pat < - (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), - (Inst $src0, $src1, $src2) ->; - -class UMad24Pat : Pat < - (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), - (Inst $src0, $src1, $src2) ->; - -multiclass Expand24IBitOps { - def _expand_imad24 : Pat < - (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_imul24 : Pat < - (AMDGPUmul_i24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - -multiclass Expand24UBitOps { - def _expand_umad24 : Pat < - (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_umul24 : Pat < - (AMDGPUmul_u24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - -class RcpPat : Pat < - (fdiv FP_ONE, vt:$src), - (RcpInst $src) ->; - -class RsqPat : Pat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) ->; - -include "R600Instructions.td" -include "R700Instructions.td" -include "EvergreenInstructions.td" -include "CaymanInstructions.td" - -include "SIInstrInfo.td" - diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp deleted file mode 100644 index e94bb6013d8..00000000000 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU Implementation of the IntrinsicInfo class. -// -//===-----------------------------------------------------------------------===// - -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" - -using namespace llvm; - -#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -#include "AMDGPUGenIntrinsics.inc" -#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN - -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() - : TargetIntrinsicInfo() {} - -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - static const char *const names[] = { -#define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_NAME_TABLE - }; - - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && - "Invalid intrinsic ID"); - - std::string Result(names[IntrID - Intrinsic::num_intrinsics]); - return Result; -} - -unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, - unsigned Len) const { - if (!StringRef(Name, Len).startswith("llvm.")) - return 0; // All intrinsics start with 'llvm.' 
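[Editor's note] The BFIPatterns, SHA256MaPattern, and BFEPattern classes defined earlier in AMDGPUInstrInfo.td are pure bit-manipulation identities, so they can be sanity-checked outside the backend. Below is only an illustrative, self-contained C++ check of those identities; the helpers `bfi` and `bfe` are stand-ins for the matched instructions, not real APIs, and `std::popcount` needs C++20.

.. code-block:: c++

  #include <bit>
  #include <cassert>
  #include <cstdint>

  // Models BFI_INT as stated in the ISA-doc comment above:
  // bits of y where x is set, bits of z where x is clear.
  static uint32_t bfi(uint32_t x, uint32_t y, uint32_t z) {
    return (y & x) | (z & ~x);
  }

  // Models a bitfield extract: the low 'width' bits of (src >> offset).
  static uint32_t bfe(uint32_t src, uint32_t offset, uint32_t width) {
    return (src >> offset) & (width < 32 ? (1u << width) - 1u : ~0u);
  }

  int main() {
    const uint32_t x = 0xF0F0F0F0u, y = 0x12345678u, z = 0x9ABCDEF0u;

    // SHA-256 Ch(x, y, z) = z ^ (x & (y ^ z)) is exactly a bitfield insert.
    assert((z ^ (x & (y ^ z))) == bfi(x, y, z));

    // SHA-256 Ma(x, y, z): (x & z) | (y & (x | z)) == BFI(x ^ y, z, y).
    assert(((x & z) | (y & (x | z))) == bfi(x ^ y, z, y));

    // fcopysign via BFI with a sign-stripping mask: magnitude from a, sign from b.
    const uint32_t a = 0xC0490FDBu /* -pi as f32 bits */, b = 0x3F800000u /* +1.0f */;
    assert(bfi(0x7FFFFFFFu, a, b) == 0x40490FDBu); // +pi

    // BFEPattern: (src >> rshift) & mask, with mask a contiguous low-bit mask,
    // equals a bitfield extract of popcount(mask) bits.
    const uint32_t src = 0xDEADBEEFu, rshift = 8, mask = 0xFFFu;
    assert(((src >> rshift) & mask) == bfe(src, rshift, std::popcount(mask)));
    return 0;
  }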
- -#define GET_FUNCTION_RECOGNIZER -#include "AMDGPUGenIntrinsics.inc" -#undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID = - (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; - IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); - - if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { - return IntrinsicID; - } - return 0; -} - -bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { -// Overload Table -#define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_OVERLOAD_TABLE -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); -} diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h deleted file mode 100644 index 4c95b5ec097..00000000000 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.h +++ /dev/null @@ -1,48 +0,0 @@ -//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. -// -//===-----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H - -#include "llvm/IR/Intrinsics.h" -#include "llvm/Target/TargetIntrinsicInfo.h" - -namespace llvm { -class TargetMachine; - -namespace AMDGPUIntrinsic { -enum ID { - last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, -#define GET_INTRINSIC_ENUM_VALUES -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_ENUM_VALUES - , num_AMDGPU_intrinsics -}; - -} // end namespace AMDGPUIntrinsic - -class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { -public: - AMDGPUIntrinsicInfo(); - std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; - unsigned lookupName(const char *Name, unsigned Len) const override; - bool isOverloaded(unsigned IID) const override; - Function *getDeclaration(Module *M, unsigned ID, - Type **Tys = nullptr, - unsigned numTys = 0) const override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td deleted file mode 100644 index ab489cd2a4a..00000000000 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ /dev/null @@ -1,90 +0,0 @@ -//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines intrinsics that are used by all hw codegen targets. 
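[Editor's note] The `enum ID` in AMDGPUIntrinsicInfo.h above starts the target-specific intrinsic IDs immediately after the core LLVM intrinsics (`last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1`), which is why getName() above indexes its table with `IntrID - Intrinsic::num_intrinsics`. A toy, standalone sketch of that numbering scheme; every name and constant here is made up for illustration.

.. code-block:: c++

  #include <cassert>
  #include <string>

  // Stand-in for Intrinsic::num_intrinsics in core LLVM (hypothetical value).
  constexpr unsigned NumCoreIntrinsics = 1000;

  // Target intrinsic IDs start right after the core ones, mirroring the enum above.
  enum TargetIntrinsicID : unsigned {
    last_non_target_intrinsic = NumCoreIntrinsics - 1,
    target_intrinsic_a,   // == NumCoreIntrinsics
    target_intrinsic_b,   // == NumCoreIntrinsics + 1
    num_target_intrinsics
  };

  // The name table holds only the target intrinsics, so it is indexed by the
  // offset from NumCoreIntrinsics, like names[IntrID - Intrinsic::num_intrinsics].
  static const char *const Names[] = {"llvm.target.a", "llvm.target.b"};

  std::string getName(unsigned IntrID) {
    assert(IntrID >= NumCoreIntrinsics && IntrID < num_target_intrinsics &&
           "not a target intrinsic");
    return Names[IntrID - NumCoreIntrinsics];
  }

  int main() {
    assert(getName(target_intrinsic_b) == "llvm.target.b");
    return 0;
  }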
-// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "AMDGPU", isTarget = 1 in { - - def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; - def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - - // This is named backwards (instead of rsq_legacy) so we don't have - // to define it with the public builtins intrinsics. This is a - // workaround for how intrinsic names are parsed. If the name is - // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant - // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. - def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; - def int_AMDGPU_kilp : Intrinsic<[], [], []>; - def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imax 
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; -} - -// Legacy names for compatibility. -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -} - -let TargetPrefix = "TGSI", isTarget = 1 in { - - def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; -} - -include "SIIntrinsics.td" diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp deleted file mode 100644 index 20831460b93..00000000000 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ /dev/null @@ -1,154 +0,0 @@ -//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
-// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPUMCInstLower.h" -#include "AMDGPUAsmPrinter.h" -#include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "R600InstrInfo.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include - -using namespace llvm; - -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): - Ctx(ctx), ST(st) -{ } - -void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); - - if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); - C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " - "a target-specific version: " + Twine(MI->getOpcode())); - } - - OutMI.setOpcode(MCOpcode); - - for (const MachineOperand &MO : MI->explicit_operands()) { - MCOperand MCOp; - switch (MO.getType()) { - default: - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: { - const GlobalValue *GV = MO.getGlobal(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); - break; - } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - case MachineOperand::MO_ExternalSymbol: { - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - } - OutMI.addOperand(MCOp); - } -} - -void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - const AMDGPUSubtarget &STI = MF->getSubtarget(); - AMDGPUMCInstLower MCInstLowering(OutContext, STI); - -#ifdef _DEBUG - StringRef Err; - if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { - errs() << "Warning: Illegal instruction detected: " << Err << "\n"; - MI->dump(); - } -#endif - if (MI->isBundle()) { - const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); - ++I; - } - } else { - MCInst TmpInst; - MCInstLowering.lower(MI, TmpInst); - EmitToStreamer(*OutStreamer, TmpInst); - - if (STI.dumpCode()) { - // Disassemble instruction/operands to text. 
- DisasmLines.resize(DisasmLines.size() + 1); - std::string &DisasmLine = DisasmLines.back(); - raw_string_ostream DisasmStream(DisasmLine); - - AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *MF->getSubtarget().getInstrInfo(), - *MF->getSubtarget().getRegisterInfo()); - InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), - MF->getSubtarget()); - - // Disassemble instruction/operands to hex representation. - SmallVector Fixups; - SmallVector CodeBytes; - raw_svector_ostream CodeStream(CodeBytes); - - auto &ObjStreamer = static_cast(*OutStreamer); - MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); - InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, - MF->getSubtarget()); - CodeStream.flush(); - - HexLines.resize(HexLines.size() + 1); - std::string &HexLine = HexLines.back(); - raw_string_ostream HexStream(HexLine); - - for (size_t i = 0; i < CodeBytes.size(); i += 4) { - unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i]; - HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord); - } - - DisasmStream.flush(); - DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size()); - } - } -} diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h deleted file mode 100644 index d322fe072b2..00000000000 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ /dev/null @@ -1,35 +0,0 @@ -//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H - -namespace llvm { - -class AMDGPUSubtarget; -class MachineInstr; -class MCContext; -class MCInst; - -class AMDGPUMCInstLower { - MCContext &Ctx; - const AMDGPUSubtarget &ST; - -public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); - - /// \brief Lower a MachineInstr to an MCInst - void lower(const MachineInstr *MI, MCInst &OutMI) const; - -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp deleted file mode 100644 index 21c7da66323..00000000000 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "AMDGPUMachineFunction.h" -#include "AMDGPU.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Function.h" -using namespace llvm; - -static const char *const ShaderTypeAttribute = "ShaderType"; - -// Pin the vtable to this file. 
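[Editor's note] The dumpCode path in EmitInstruction above groups the encoded instruction bytes into 32-bit words and prints them as zero-padded hex. A minimal standalone sketch of that formatting step (hypothetical helper; it assumes a little-endian host and a byte count that is a multiple of four, which is what the loop above effectively relies on).

.. code-block:: c++

  #include <cstdint>
  #include <cstdio>
  #include <cstring>
  #include <vector>

  // Print an encoded instruction buffer as space-separated 32-bit hex words,
  // mirroring the "%s%08X" formatting used by the dump loop above.
  void printHexDwords(const std::vector<uint8_t> &CodeBytes) {
    for (size_t i = 0; i + 4 <= CodeBytes.size(); i += 4) {
      uint32_t DWord;
      std::memcpy(&DWord, &CodeBytes[i], 4); // avoids the unaligned cast in the original
      std::printf("%s%08X", i > 0 ? " " : "", static_cast<unsigned>(DWord));
    }
    std::printf("\n");
  }

  int main() {
    // Hypothetical 8-byte encoding, just to exercise the formatting.
    printHexDwords({0x01, 0x02, 0x03, 0x04, 0xAA, 0xBB, 0xCC, 0xDD});
    return 0;
  }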
-void AMDGPUMachineFunction::anchor() {} - -AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo(), - ShaderType(ShaderType::COMPUTE), - LDSSize(0), - ScratchSize(0), - IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } -} diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h deleted file mode 100644 index f5e4694e76f..00000000000 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ /dev/null @@ -1,45 +0,0 @@ -//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H -#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H - -#include "llvm/CodeGen/MachineFunction.h" -#include - -namespace llvm { - -class AMDGPUMachineFunction : public MachineFunctionInfo { - virtual void anchor(); - unsigned ShaderType; - -public: - AMDGPUMachineFunction(const MachineFunction &MF); - /// A map to keep track of local memory objects and their offsets within - /// the local memory space. - std::map LocalMemoryObjects; - /// Number of bytes in the LDS that are being used. - unsigned LDSSize; - - /// Start of implicit kernel args - unsigned ABIArgOffset; - - unsigned getShaderType() const { - return ShaderType; - } - - unsigned ScratchSize; - bool IsKernel; -}; - -} -#endif diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp deleted file mode 100644 index 4a65bfc57f1..00000000000 --- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ /dev/null @@ -1,407 +0,0 @@ -//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass eliminates allocas by either converting them into vectors or -// by migrating them to local address space. 
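[Editor's note] AMDGPUMachineFunction's constructor above reads the shader type from a string-valued "ShaderType" function attribute, defaulting to compute when the attribute is absent; note that StringRef::getAsInteger() returns true on *failure*, so the llvm_unreachable only fires for an attribute that is present but not numeric. A rough standalone model of that logic, using a plain C string in place of the LLVM attribute API (the enumerator values are illustrative).

.. code-block:: c++

  #include <cstdlib>
  #include <stdexcept>

  enum ShaderType { PIXEL = 0, VERTEX = 1, GEOMETRY = 2, COMPUTE = 3 }; // illustrative values

  // Models the constructor above: no attribute -> COMPUTE; a present but
  // non-numeric attribute -> hard error (the real code hits llvm_unreachable).
  unsigned shaderTypeFromAttribute(const char *Attr /* nullptr if absent */) {
    unsigned Type = COMPUTE;                         // default, as in the constructor above
    if (Attr) {
      char *End = nullptr;
      unsigned long V = std::strtoul(Attr, &End, 0); // radix 0, like getAsInteger(0, ...)
      if (End == Attr || *End != '\0')
        throw std::logic_error("Can't parse shader type!");
      Type = static_cast<unsigned>(V);
    }
    return Type;
  }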
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "amdgpu-promote-alloca" - -using namespace llvm; - -namespace { - -class AMDGPUPromoteAlloca : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - const AMDGPUSubtarget &ST; - int LocalMemAvailable; - -public: - AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), - LocalMemAvailable(0) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - const char *getPassName() const override { return "AMDGPU Promote Alloca"; } - void visitAlloca(AllocaInst &I); -}; - -} // End anonymous namespace - -char AMDGPUPromoteAlloca::ID = 0; - -bool AMDGPUPromoteAlloca::doInitialization(Module &M) { - Mod = &M; - return false; -} - -bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - - const FunctionType *FTy = F.getFunctionType(); - - LocalMemAvailable = ST.getLocalMemorySize(); - - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); - if (ParamTy->isPointerTy() && - ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LocalMemAvailable = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - break; - } - } - - if (LocalMemAvailable > 0) { - // Check how much local memory is being used by global objects - for (Module::global_iterator I = Mod->global_begin(), - E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; - PointerType *GVTy = GV->getType(); - if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) - continue; - for (Value::use_iterator U = GV->use_begin(), - UE = GV->use_end(); U != UE; ++U) { - Instruction *Use = dyn_cast(*U); - if (!Use) - continue; - if (Use->getParent()->getParent() == &F) - LocalMemAvailable -= - Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); - } - } - } - - LocalMemAvailable = std::max(0, LocalMemAvailable); - DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); - - visit(F); - - return false; -} - -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { - return VectorType::get(ArrayTy->getArrayElementType(), - ArrayTy->getArrayNumElements()); -} - -static Value * -calculateVectorIndex(Value *Ptr, - const std::map &GEPIdx) { - if (isa(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - - GetElementPtrInst *GEP = cast(Ptr); - - auto I = GEPIdx.find(GEP); - return I == GEPIdx.end() ? nullptr : I->second; -} - -static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { - // FIXME we only support simple cases - if (GEP->getNumOperands() != 3) - return NULL; - - ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); - if (!I0 || !I0->isZero()) - return NULL; - - return GEP->getOperand(2); -} - -// Not an instruction handled below to turn into a vector. -// -// TODO: Check isTriviallyVectorizable for calls and handle other -// instructions. 
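[Editor's note] runOnFunction above computes a local-memory budget before looking at any alloca: it starts from the subtarget's LDS size, zeroes the budget if any kernel argument is already a local-address-space pointer (such an argument may legitimately claim all of LDS), and subtracts the LDS globals this function uses. A compact, hypothetical model of that accounting; the in-tree pass walks each global's use list, while this sketch just flags whether a global is used by the function.

.. code-block:: c++

  #include <algorithm>
  #include <vector>

  struct LDSGlobal {
    unsigned SizeInBytes;
    bool UsedByThisFunction;
  };

  // Mirrors the budget logic in runOnFunction above (sizes in bytes).
  int computeLocalMemAvailable(int LocalMemorySize,          // ST.getLocalMemorySize()
                               bool HasLocalPointerArgument, // any param in LOCAL_ADDRESS?
                               const std::vector<LDSGlobal> &Globals) {
    int Available = LocalMemorySize;
    if (HasLocalPointerArgument)
      Available = 0; // the argument may already cover all of local memory
    if (Available > 0)
      for (const LDSGlobal &G : Globals)
        if (G.UsedByThisFunction)
          Available -= static_cast<int>(G.SizeInBytes);
    return std::max(0, Available); // clamped, as in the pass above
  }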
-static bool canVectorizeInst(Instruction *Inst) { - switch (Inst->getOpcode()) { - case Instruction::Load: - case Instruction::Store: - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - return true; - default: - return false; - } -} - -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { - Type *AllocaTy = Alloca->getAllocatedType(); - - DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); - - // FIXME: There is no reason why we can't support larger arrays, we - // are just being conservative for now. - if (!AllocaTy->isArrayTy() || - AllocaTy->getArrayElementType()->isVectorTy() || - AllocaTy->getArrayNumElements() > 4) { - - DEBUG(dbgs() << " Cannot convert type to vector"); - return false; - } - - std::map GEPVectorIdx; - std::vector WorkList; - for (User *AllocaUser : Alloca->users()) { - GetElementPtrInst *GEP = dyn_cast(AllocaUser); - if (!GEP) { - if (!canVectorizeInst(cast(AllocaUser))) - return false; - - WorkList.push_back(AllocaUser); - continue; - } - - Value *Index = GEPToVectorIndex(GEP); - - // If we can't compute a vector index from this GEP, then we can't - // promote this alloca to vector. - if (!Index) { - DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); - return false; - } - - GEPVectorIdx[GEP] = Index; - for (User *GEPUser : AllocaUser->users()) { - if (!canVectorizeInst(cast(GEPUser))) - return false; - - WorkList.push_back(GEPUser); - } - } - - VectorType *VectorTy = arrayTypeToVecType(AllocaTy); - - DEBUG(dbgs() << " Converting alloca to vector " - << *AllocaTy << " -> " << *VectorTy << '\n'); - - for (std::vector::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - Instruction *Inst = cast(*I); - IRBuilder<> Builder(Inst); - switch (Inst->getOpcode()) { - case Instruction::Load: { - Value *Ptr = Inst->getOperand(0); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); - Value *VecValue = Builder.CreateLoad(BitCast); - Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); - Inst->replaceAllUsesWith(ExtractElement); - Inst->eraseFromParent(); - break; - } - case Instruction::Store: { - Value *Ptr = Inst->getOperand(1); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); - Value *VecValue = Builder.CreateLoad(BitCast); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, - Inst->getOperand(0), - Index); - Builder.CreateStore(NewVecValue, BitCast); - Inst->eraseFromParent(); - break; - } - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - break; - - default: - Inst->dump(); - llvm_unreachable("Inconsistency in instructions promotable to vector"); - } - } - return true; -} - -static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { - bool Success = true; - for (User *User : Val->users()) { - if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) - continue; - if (isa(User)) { - WorkList.push_back(User); - continue; - } - - // FIXME: Correctly handle ptrtoint instructions. 
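[Editor's note] tryPromoteAllocaToVector above turns a small private array into whole-vector accesses: a load of arr[i] becomes a load of the entire vector plus an extractelement, and a store becomes a load of the entire vector, an insertelement, and a store back. Written as ordinary C++ over a stand-in 4-element type, the effect looks roughly like this (illustrative only; the real rewrite operates on LLVM IR through a bitcast of the alloca).

.. code-block:: c++

  #include <array>
  #include <cstddef>

  using Vec4 = std::array<float, 4>; // stand-in for the <4 x float> the pass creates

  // Before: a 4-element private array accessed through dynamic indices.
  float before(std::size_t i, float v) {
    float arr[4] = {};
    arr[i] = v;      // store via a GEP (&arr[i])
    return arr[i];   // load via the same GEP
  }

  // After: the array is viewed as one vector; stores become a whole-vector read,
  // an element insert, and a whole-vector write; loads become a whole-vector
  // read plus an element extract.
  float after(std::size_t i, float v) {
    Vec4 vec = {};     // the alloca, reinterpreted as a single vector
    Vec4 tmp = vec;    // load whole vector
    tmp[i] = v;        // insertelement
    vec = tmp;         // store whole vector
    Vec4 tmp2 = vec;   // load whole vector
    return tmp2[i];    // extractelement
  }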
- Instruction *UseInst = dyn_cast(User); - if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) - return false; - - if (!User->getType()->isPointerTy()) - continue; - - WorkList.push_back(User); - - Success &= collectUsesWithPtrTypes(User, WorkList); - } - return Success; -} - -void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { - IRBuilder<> Builder(&I); - - // First try to replace the alloca with a vector - Type *AllocaTy = I.getAllocatedType(); - - DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - if (tryPromoteAllocaToVector(&I)) - return; - - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - - // FIXME: This is the maximum work group size. We should try to get - // value from the reqd_work_group_size function attribute if it is - // available. - unsigned WorkGroupSize = 256; - int AllocaSize = - WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); - - if (AllocaSize > LocalMemAvailable) { - DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); - return; - } - - std::vector WorkList; - - if (!collectUsesWithPtrTypes(&I, WorkList)) { - DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; - } - - DEBUG(dbgs() << "Promoting alloca to local memory\n"); - LocalMemAvailable -= AllocaSize; - - Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); - GlobalVariable *GV = new GlobalVariable( - *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, - GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); - - FunctionType *FTy = FunctionType::get( - Type::getInt32Ty(Mod->getContext()), false); - AttributeSet AttrSet; - AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); - - Value *ReadLocalSizeY = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.y", FTy, AttrSet); - Value *ReadLocalSizeZ = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.z", FTy, AttrSet); - Value *ReadTIDIGX = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.x", FTy, AttrSet); - Value *ReadTIDIGY = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.y", FTy, AttrSet); - Value *ReadTIDIGZ = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.z", FTy, AttrSet); - - Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); - Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); - Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); - Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); - Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); - - Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); - Tmp0 = Builder.CreateMul(Tmp0, TIdX); - Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); - Value *TID = Builder.CreateAdd(Tmp0, Tmp1); - TID = Builder.CreateAdd(TID, TIdZ); - - std::vector Indices; - Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); - Indices.push_back(TID); - - Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); - - for (std::vector::iterator i = WorkList.begin(), - e = WorkList.end(); i != e; ++i) { - Value *V = *i; - CallInst *Call = dyn_cast(V); - if (!Call) { - Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); - - // The operand's value should be corrected on its own. - if (isa(V)) - continue; - - // FIXME: It doesn't really make sense to try to do this for all - // instructions. 
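[Editor's note] When an alloca is moved to local memory instead, visitAlloca above creates a 256-entry LDS array (one copy of the object per work-item) and indexes it with a flattened work-item ID built from the llvm.r600.read.local.size.* and llvm.r600.read.tidig.* calls it inserts. The flattening is an ordinary row-major linearisation; here it is as a plain, hypothetical C++ helper (in the pass these values come from intrinsic calls).

.. code-block:: c++

  #include <cstdint>

  // Mirrors the index computation emitted by visitAlloca above:
  //   TID = tid.x * (local_size.y * local_size.z) + tid.y * local_size.z + tid.z
  uint32_t flatWorkItemId(uint32_t TidX, uint32_t TidY, uint32_t TidZ,
                          uint32_t CntY, uint32_t CntZ) {
    return TidX * (CntY * CntZ) + TidY * CntZ + TidZ;
  }

  // Sizing check done before the promotion: one copy of the alloca per work-item,
  // with the maximum work-group size hard-coded to 256 (see the FIXME above).
  bool fitsInLocalMemory(unsigned AllocaTypeSize, int LocalMemAvailable) {
    const unsigned WorkGroupSize = 256;
    int AllocaSize = static_cast<int>(WorkGroupSize * AllocaTypeSize);
    return AllocaSize <= LocalMemAvailable;
  }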
- V->mutateType(NewTy); - continue; - } - - IntrinsicInst *Intr = dyn_cast(Call); - if (!Intr) { - std::vector ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast(C); - Call->setCalledFunction(NewF); - continue; - } - - Builder.SetInsertPoint(Intr); - switch (Intr->getIntrinsicID()) { - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - // These intrinsics are for address space 0 only - Intr->eraseFromParent(); - continue; - case Intrinsic::memcpy: { - MemCpyInst *MemCpy = cast(Intr); - Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), - MemCpy->getLength(), MemCpy->getAlignment(), - MemCpy->isVolatile()); - Intr->eraseFromParent(); - continue; - } - case Intrinsic::memset: { - MemSetInst *MemSet = cast(Intr); - Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), - MemSet->getLength(), MemSet->getAlignment(), - MemSet->isVolatile()); - Intr->eraseFromParent(); - continue; - } - default: - Intr->dump(); - llvm_unreachable("Don't know how to promote alloca intrinsic use."); - } - } -} - -FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { - return new AMDGPUPromoteAlloca(ST); -} diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp deleted file mode 100644 index 3ca0eca3417..00000000000 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" - -using namespace llvm; - -AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} - -//===----------------------------------------------------------------------===// -// Function handling callbacks - Functions are a seldom used feature of GPUS, so -// they are not supported at this time. 
-//===----------------------------------------------------------------------===// - -const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; - -const MCPhysReg* -AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return &CalleeSavedReg; -} - -void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("Subroutines not supported yet"); -} - -unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; -} - -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15 - }; - - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; -} - -unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { - - return getSubRegFromChannel(IndirectIndex); -} - -#define GET_REGINFO_TARGET_DESC -#include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h deleted file mode 100644 index cfd800bdc70..00000000000 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ /dev/null @@ -1,64 +0,0 @@ -//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief TargetRegisterInfo interface that is implemented by all hw codegen -/// targets. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H - -#include "llvm/ADT/BitVector.h" -#include "llvm/Target/TargetRegisterInfo.h" - -#define GET_REGINFO_HEADER -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" - -namespace llvm { - -class AMDGPUSubtarget; -class TargetInstrInfo; - -struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - static const MCPhysReg CalleeSavedReg; - - AMDGPURegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override { - assert(!"Unimplemented"); return BitVector(); - } - - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - - virtual unsigned getHWRegIndex(unsigned Reg) const { - assert(!"Unimplemented"); return 0; - } - - /// \returns the sub reg enum value for the given \p Channel - /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) - unsigned getSubRegFromChannel(unsigned Channel) const; - - const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getIndirectSubReg(unsigned IndirectIndex) const; - -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td deleted file mode 100644 index 835a1464395..00000000000 --- a/lib/Target/R600/AMDGPURegisterInfo.td +++ /dev/null @@ -1,26 +0,0 @@ -//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Tablegen register definitions common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -let Namespace = "AMDGPU" in { - -foreach Index = 0-15 in { - // Indices are used in a variety of ways here, so don't set a size/offset. - def sub#Index : SubRegIndex<-1, -1>; -} - -def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; - -} - -include "R600RegisterInfo.td" -include "SIRegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp deleted file mode 100644 index 605ccd0e136..00000000000 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ /dev/null @@ -1,133 +0,0 @@ -//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" -#include "R600MachineScheduler.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/CodeGen/MachineScheduler.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-subtarget" - -#define GET_SUBTARGETINFO_ENUM -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#include "AMDGPUGenSubtargetInfo.inc" - -AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { - // Determine default and user-specified characteristics - // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be - // enabled, but some instructions do not respect them and they run at the - // double precision rate, so don't enable by default. - // - // We want to be able to turn these off, but making this a subtarget feature - // for SI has the unhelpful behavior that it unsets everything else if you - // disable it. 
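[Editor's note] getSubRegFromChannel above and the `foreach Index = 0-15` loop in AMDGPURegisterInfo.td describe the same mapping: 32-bit channel N of a wide register is addressed through sub-register index subN. A trivial standalone sketch of that lookup; the enumerator values are placeholders, since the real ones come from the generated AMDGPUGenRegisterInfo.inc.

.. code-block:: c++

  #include <cassert>

  // Placeholder enumerators standing in for AMDGPU::sub0..sub15.
  enum SubRegIndex { sub0, sub1, sub2,  sub3,  sub4,  sub5,  sub6,  sub7,
                     sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15 };

  unsigned getSubRegFromChannel(unsigned Channel) {
    static const SubRegIndex SubRegs[] = {
      sub0, sub1, sub2,  sub3,  sub4,  sub5,  sub6,  sub7,
      sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15};
    assert(Channel < sizeof(SubRegs) / sizeof(SubRegs[0]) && "only 16 channels");
    return SubRegs[Channel];
  }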
- - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); - FullFS += FS; - - if (GPU == "" && TT.getArch() == Triple::amdgcn) - GPU = "SI"; - - ParseSubtargetFeatures(GPU, FullFS); - - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP32Denormals = false; - FP64Denormals = false; - } - return *this; -} - -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - TargetMachine &TM) - : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), - DumpCode(false), R600ALUInst(false), HasVertexCache(false), - TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), - GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), - InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { - - initializeSubtargetDependencies(TT, GPU, FS); - - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); - TLInfo.reset(new R600TargetLowering(TM, *this)); - } else { - InstrInfo.reset(new SIInstrInfo(*this)); - TLInfo.reset(new SITargetLowering(TM, *this)); - } -} - -unsigned AMDGPUSubtarget::getStackEntrySize() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - switch(getWavefrontSize()) { - case 16: - return 8; - case 32: - return hasCaymanISA() ? 4 : 8; - case 64: - return 4; - default: - llvm_unreachable("Illegal wavefront size."); - } -} - -unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { - switch(getGeneration()) { - default: llvm_unreachable("ChipID unknown"); - case SEA_ISLANDS: return 12; - } -} - -bool AMDGPUSubtarget::isVGPRSpillingEnabled( - const SIMachineFunctionInfo *MFI) const { - return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; -} - -void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const { - if (getGeneration() >= SOUTHERN_ISLANDS) { - - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; - } -} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h deleted file mode 100644 index 0d40d14f820..00000000000 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ /dev/null @@ -1,282 +0,0 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#include "AMDGPU.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -#define GET_SUBTARGETINFO_HEADER -#include "AMDGPUGenSubtargetInfo.inc" - -namespace llvm { - -class SIMachineFunctionInfo; - -class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { - -public: - enum Generation { - R600 = 0, - R700, - EVERGREEN, - NORTHERN_ISLANDS, - SOUTHERN_ISLANDS, - SEA_ISLANDS, - VOLCANIC_ISLANDS, - }; - - enum { - FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 - }; - -private: - std::string DevName; - bool Is64bit; - bool DumpCode; - bool R600ALUInst; - bool HasVertexCache; - short TexVTXClauseSize; - Generation Gen; - bool FP64; - bool FP64Denormals; - bool FP32Denormals; - bool FastFMAF32; - bool CaymanISA; - bool FlatAddressSpace; - bool EnableIRStructurizer; - bool EnablePromoteAlloca; - bool EnableIfCvt; - bool EnableLoadStoreOpt; - unsigned WavefrontSize; - bool CFALUBug; - int LocalMemorySize; - bool EnableVGPRSpilling; - bool SGPRInitBug; - bool IsGCN; - bool GCN1Encoding; - bool GCN3Encoding; - bool CIInsts; - bool FeatureDisable; - int LDSBankCount; - - AMDGPUFrameLowering FrameLowering; - std::unique_ptr TLInfo; - std::unique_ptr InstrInfo; - InstrItineraryData InstrItins; - Triple TargetTriple; - -public: - AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, - TargetMachine &TM); - AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS); - - const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const AMDGPUInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); - } - const AMDGPURegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); - } - AMDGPUTargetLowering *getTargetLowering() const override { - return TLInfo.get(); - } - const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; - } - - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - bool is64bit() const { - return Is64bit; - } - - bool hasVertexCache() const { - return HasVertexCache; - } - - short getTexVTXClauseSize() const { - return TexVTXClauseSize; - } - - Generation getGeneration() const { - return Gen; - } - - bool hasHWFP64() const { - return FP64; - } - - bool hasCaymanISA() const { - return CaymanISA; - } - - bool hasFP32Denormals() const { - return FP32Denormals; - } - - bool hasFP64Denormals() const { - return FP64Denormals; - } - - bool hasFastFMAF32() const { - return FastFMAF32; - } - - bool hasFlatAddressSpace() const { - return FlatAddressSpace; - } - - bool hasBFE() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBFI() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBFM() const { - return hasBFE(); - } - - bool hasBCNT(unsigned Size) const { - if (Size == 32) - return (getGeneration() >= EVERGREEN); - - if (Size == 64) - return (getGeneration() >= SOUTHERN_ISLANDS); - - return false; - } - - bool hasMulU24() const { - return (getGeneration() >= 
EVERGREEN); - } - - bool hasMulI24() const { - return (getGeneration() >= SOUTHERN_ISLANDS || - hasCaymanISA()); - } - - bool hasFFBL() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasFFBH() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasCARRY() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBORROW() const { - return (getGeneration() >= EVERGREEN); - } - - bool IsIRStructurizerEnabled() const { - return EnableIRStructurizer; - } - - bool isPromoteAllocaEnabled() const { - return EnablePromoteAlloca; - } - - bool isIfCvtEnabled() const { - return EnableIfCvt; - } - - bool loadStoreOptEnabled() const { - return EnableLoadStoreOpt; - } - - unsigned getWavefrontSize() const { - return WavefrontSize; - } - - unsigned getStackEntrySize() const; - - bool hasCFAluBug() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - return CFALUBug; - } - - int getLocalMemorySize() const { - return LocalMemorySize; - } - - bool hasSGPRInitBug() const { - return SGPRInitBug; - } - - int getLDSBankCount() const { - return LDSBankCount; - } - - unsigned getAmdKernelCodeChipID() const; - - bool enableMachineScheduler() const override { - return true; - } - - void overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const override; - - // Helper functions to simplify if statements - bool isTargetELF() const { - return false; - } - - StringRef getDeviceName() const { - return DevName; - } - - bool dumpCode() const { - return DumpCode; - } - bool r600ALUEncoding() const { - return R600ALUInst; - } - bool isAmdHsaOS() const { - return TargetTriple.getOS() == Triple::AMDHSA; - } - bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; - - unsigned getMaxWavesPerCU() const { - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 10; - - // FIXME: Not sure what this is for other subtagets. - llvm_unreachable("do not know max waves per CU for this subtarget."); - } - - bool enableSubRegLiveness() const override { - return true; - } -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp deleted file mode 100644 index d65c010888a..00000000000 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ /dev/null @@ -1,292 +0,0 @@ -//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The AMDGPU target machine contains all of the hardware specific -/// information needed to emit code for R600 and SI GPUs. 
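[Editor's note] Most feature queries in AMDGPUSubtarget.h above are written as ordered comparisons against the Generation enum (hasBFE() is getGeneration() >= EVERGREEN; hasBCNT() wants EVERGREEN for the 32-bit form but SOUTHERN_ISLANDS for the 64-bit one), which only works because the enumerators are declared in hardware order. A minimal toy sketch of the idiom, not the real subtarget.

.. code-block:: c++

  // Toy version of the ordered-generation idiom used by the subtarget above.
  enum Generation { R600, R700, EVERGREEN, NORTHERN_ISLANDS,
                    SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS };

  struct ToySubtarget {
    Generation Gen;

    // Features introduced with a generation remain available on later ones,
    // so a single >= comparison is enough.
    bool hasBFE() const { return Gen >= EVERGREEN; }

    bool hasBCNT(unsigned Size) const {
      if (Size == 32) return Gen >= EVERGREEN;
      if (Size == 64) return Gen >= SOUTHERN_ISLANDS;
      return false;
    }
  };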
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetMachine.h" -#include "AMDGPU.h" -#include "AMDGPUTargetTransformInfo.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" -#include "R600MachineScheduler.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_os_ostream.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" -#include - -using namespace llvm; - -extern "C" void LLVMInitializeR600Target() { - // Register the target - RegisterTargetMachine X(TheAMDGPUTarget); - RegisterTargetMachine Y(TheGCNTarget); -} - -static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique()); -} - -static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); - -static std::string computeDataLayout(const Triple &TT) { - std::string Ret = "e-p:32:32"; - - if (TT.getArch() == Triple::amdgcn) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - -AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, - OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), - IntrinsicInfo() { - setRequiresStructuredCFG(true); - initAsmInfo(); -} - -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} - -//===----------------------------------------------------------------------===// -// R600 Target Machine (R600 -> Cayman) -//===----------------------------------------------------------------------===// - -R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} - -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// - -GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} - -//===----------------------------------------------------------------------===// -// AMDGPU Pass Setup -//===----------------------------------------------------------------------===// - -namespace { -class AMDGPUPassConfig : public TargetPassConfig { -public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - AMDGPUTargetMachine &getAMDGPUTargetMachine() const { - return getTM(); - } - - 
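  // Note (added for clarity, not part of the original file): the override
  // below returns the custom R600 VLIW scheduler only for subtargets up to
  // NORTHERN_ISLANDS; returning nullptr for GCN subtargets lets the generic
  // machine scheduler be used instead.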
ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - return createR600MachineScheduler(C); - return nullptr; - } - - void addIRPasses() override; - void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; -}; - -class R600PassConfig : public AMDGPUPassConfig { -public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } - - bool addPreISel() override; - void addPreRegAlloc() override; - void addPreSched2() override; - void addPreEmitPass() override; -}; - -class GCNPassConfig : public AMDGPUPassConfig { -public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } - bool addPreISel() override; - bool addInstSelector() override; - void addPreRegAlloc() override; - void addPostRegAlloc() override; - void addPreSched2() override; - void addPreEmitPass() override; -}; - -} // End of anonymous namespace - -TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); -} - -void AMDGPUPassConfig::addIRPasses() { - // Function calls are not supported, so make sure we inline everything. - addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerPass()); - // We need to add the barrier noop pass, otherwise adding the function - // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a nodule with two - // functions, then we will generate code for the first function - // without ever running any passes on the second. 
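  // Note (added for clarity): "nodule" in the comment above is a typo for
  // "module". The barrier inserted below keeps the always-inliner from being
  // folded into the per-function pipeline, so every function in the module is
  // inlined before any single function proceeds toward codegen.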
- addPass(createBarrierNoopPass()); - TargetPassConfig::addIRPasses(); -} - -void AMDGPUPassConfig::addCodeGenPrepare() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.isPromoteAllocaEnabled()) { - addPass(createAMDGPUPromoteAlloca(ST)); - addPass(createSROAPass()); - } - TargetPassConfig::addCodeGenPrepare(); -} - -bool -AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - addPass(createFlattenCFGPass()); - if (ST.IsIRStructurizerEnabled()) - addPass(createStructurizeCFGPass()); - return false; -} - -bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); - return false; -} - -//===----------------------------------------------------------------------===// -// R600 Pass Setup -//===----------------------------------------------------------------------===// - -bool R600PassConfig::addPreISel() { - AMDGPUPassConfig::addPreISel(); - addPass(createR600TextureIntrinsicsReplacer()); - return false; -} - -void R600PassConfig::addPreRegAlloc() { - addPass(createR600VectorRegMerger(*TM)); -} - -void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) - addPass(&IfConverterID, false); - addPass(createR600ClauseMergePass(*TM), false); -} - -void R600PassConfig::addPreEmitPass() { - addPass(createAMDGPUCFGStructurizerPass(), false); - addPass(createR600ExpandSpecialInstrsPass(*TM), false); - addPass(&FinalizeMachineBundlesID, false); - addPass(createR600Packetizer(*TM), false); - addPass(createR600ControlFlowFinalizer(*TM), false); -} - -TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { - return new R600PassConfig(this, PM); -} - -//===----------------------------------------------------------------------===// -// GCN Pass Setup -//===----------------------------------------------------------------------===// - -bool GCNPassConfig::addPreISel() { - AMDGPUPassConfig::addPreISel(); - addPass(createSinkingPass()); - addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); - return false; -} - -bool GCNPassConfig::addInstSelector() { - AMDGPUPassConfig::addInstSelector(); - addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); - addPass(createSIFoldOperandsPass()); - return false; -} - -void GCNPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - - // This needs to be run directly before register allocation because - // earlier passes might recompute live intervals. - // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass - if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); - } - - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. - - // This should be run after scheduling, but before register allocation. It - // also need extra copies to the address operand to be eliminated. 
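  // Note (added for clarity): insertPass(&MachineSchedulerID,
  // &SILoadStoreOptimizerID) below runs the load/store optimizer immediately
  // after the machine scheduler, which is exactly the "after scheduling, but
  // before register allocation" placement described above.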
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); - } - addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); -} - -void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); - addPass(createSIShrinkInstructionsPass(), false); -} - -void GCNPassConfig::addPreSched2() { - addPass(createSIInsertWaits(*TM), false); -} - -void GCNPassConfig::addPreEmitPass() { - addPass(createSILowerControlFlowPass(*TM), false); -} - -TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { - return new GCNPassConfig(this, PM); -} diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h deleted file mode 100644 index 14792e347a7..00000000000 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ /dev/null @@ -1,89 +0,0 @@ -//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H - -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "llvm/IR/DataLayout.h" - -namespace llvm { - -//===----------------------------------------------------------------------===// -// AMDGPU Target Machine (R600+) -//===----------------------------------------------------------------------===// - -class AMDGPUTargetMachine : public LLVMTargetMachine { -private: - -protected: - TargetLoweringObjectFile *TLOF; - AMDGPUSubtarget Subtarget; - AMDGPUIntrinsicInfo IntrinsicInfo; - -public: - AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - ~AMDGPUTargetMachine(); - - const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - TargetIRAnalysis getTargetIRAnalysis() override; - - TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; - } -}; - -//===----------------------------------------------------------------------===// -// R600 Target Machine (R600 -> Cayman) -//===----------------------------------------------------------------------===// - -class R600TargetMachine : public AMDGPUTargetMachine { - -public: - R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -}; - -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// - -class GCNTargetMachine : public AMDGPUTargetMachine { - -public: - 
GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp deleted file mode 100644 index 6dacc742b12..00000000000 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ /dev/null @@ -1,82 +0,0 @@ -//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// \file -// This file implements a TargetTransformInfo analysis pass specific to the -// AMDGPU target machine. It uses the target's detailed information to provide -// more precise answers to certain TTI queries, while letting the target -// independent and default TTI implementations handle the rest. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetTransformInfo.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -using namespace llvm; - -#define DEBUG_TYPE "AMDGPUtti" - -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, - TTI::UnrollingPreferences &UP) { - UP.Threshold = 300; // Twice the default. - UP.MaxCount = UINT_MAX; - UP.Partial = true; - - // TODO: Do we want runtime unrolling? - - for (const BasicBlock *BB : L->getBlocks()) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - for (const Instruction &I : *BB) { - const GetElementPtrInst *GEP = dyn_cast(&I); - if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) - continue; - - const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = - dyn_cast(GetUnderlyingObject(Ptr, DL)); - if (Alloca) { - // We want to do whatever we can to limit the number of alloca - // instructions that make it through to the code generator. allocas - // require us to use indirect addressing, which is slow and prone to - // compiler bugs. If this loop does an address calculation on an - // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. - UP.Threshold = 800; - } - } - } -} - -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; - - // Number of VGPRs on SI. - if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 256; - - return 4 * 128; // XXX - 4 channels. Should these count as vector instead? -} - -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } - -unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { - // Semi-arbitrary large amount. 
- return 64; -} diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.h b/lib/Target/R600/AMDGPUTargetTransformInfo.h deleted file mode 100644 index 791c84e6f28..00000000000 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.h +++ /dev/null @@ -1,78 +0,0 @@ -//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file a TargetTransformInfo::Concept conforming object specific to the -/// AMDGPU target machine. It uses the target's detailed information to -/// provide more precise answers to certain TTI queries, while letting the -/// target independent and default TTI implementations handle the rest. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -class AMDGPUTTIImpl : public BasicTTIImplBase { - typedef BasicTTIImplBase BaseT; - typedef TargetTransformInfo TTI; - friend BaseT; - - const AMDGPUSubtarget *ST; - const AMDGPUTargetLowering *TLI; - - const AMDGPUSubtarget *getST() const { return ST; } - const AMDGPUTargetLowering *getTLI() const { return TLI; } - -public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) - : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} - - // Provide value semantics. MSVC requires that we spell all of these out. - AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) - : BaseT(static_cast(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} - AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) - : BaseT(std::move(static_cast(Arg))), ST(std::move(Arg.ST)), - TLI(std::move(Arg.TLI)) {} - AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { - BaseT::operator=(static_cast(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } - - bool hasBranchDivergence() { return true; } - - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); - - TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; - } - - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(unsigned VF); -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp deleted file mode 100644 index c9b25a1a0b8..00000000000 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ /dev/null @@ -1,1912 +0,0 @@ -//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//==-----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "structcfg" - -#define DEFAULT_VEC_SLOTS 8 - -// TODO: move-begin. - -//===----------------------------------------------------------------------===// -// -// Statistics for CFGStructurizer. -// -//===----------------------------------------------------------------------===// - -STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " - "matched"); -STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " - "matched"); -STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " - "pattern matched"); -STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); -STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); - -namespace llvm { - void initializeAMDGPUCFGStructurizerPass(PassRegistry&); -} - -//===----------------------------------------------------------------------===// -// -// Miscellaneous utility for CFGStructurizer. 
-// -//===----------------------------------------------------------------------===// -namespace { -#define SHOWNEWINSTR(i) \ - DEBUG(dbgs() << "New instr: " << *i << "\n"); - -#define SHOWNEWBLK(b, msg) \ -DEBUG( \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - dbgs() << "\n"; \ -); - -#define SHOWBLK_DETAIL(b, msg) \ -DEBUG( \ - if (b) { \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - b->print(dbgs()); \ - dbgs() << "\n"; \ - } \ -); - -#define INVALIDSCCNUM -1 - -template -void ReverseVector(SmallVectorImpl &Src) { - size_t sz = Src.size(); - for (size_t i = 0; i < sz/2; ++i) { - NodeT *t = Src[i]; - Src[i] = Src[sz - i - 1]; - Src[sz - i - 1] = t; - } -} - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// -// supporting data structure for CFGStructurizer -// -//===----------------------------------------------------------------------===// - - -namespace { - -class BlockInformation { -public: - bool IsRetired; - int SccNum; - BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} -}; - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// -// CFGStructurizer -// -//===----------------------------------------------------------------------===// - -namespace { -class AMDGPUCFGStructurizer : public MachineFunctionPass { -public: - typedef SmallVector MBBVector; - typedef std::map MBBInfoMap; - typedef std::map LoopLandInfoMap; - - enum PathToKind { - Not_SinglePath = 0, - SinglePath_InPath = 1, - SinglePath_NotInPath = 2 - }; - - static char ID; - - AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { - initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); - } - - const char *getPassName() const override { - return "AMDGPU Control Flow Graph structurizer Pass"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - - /// Perform the CFG structurization - bool run(); - - /// Perform the CFG preparation - /// This step will remove every unconditionnal/dead jump instructions and make - /// sure all loops have an exit block - bool prepare(); - - bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = &TII->getRegisterInfo(); - DEBUG(MF.dump();); - OrderedBlks.clear(); - Visited.clear(); - FuncRep = &MF; - MLI = &getAnalysis(); - DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); - MDT = &getAnalysis(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); - PDT = &getAnalysis(); - DEBUG(PDT->print(dbgs());); - prepare(); - run(); - DEBUG(MF.dump();); - return true; - } - -protected: - MachineDominatorTree *MDT; - MachinePostDominatorTree *PDT; - MachineLoopInfo *MLI; - const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; - - // PRINT FUNCTIONS - /// Print the ordered Blocks. 
- void printOrderedBlocks() const { - size_t i = 0; - for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), - iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { - dbgs() << "BB" << (*iterBlk)->getNumber(); - dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; - if (i != 0 && i % 10 == 0) { - dbgs() << "\n"; - } else { - dbgs() << " "; - } - } - } - static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs(), 0); - } - } - - // UTILITY FUNCTIONS - int getSCCNum(MachineBasicBlock *MBB) const; - MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; - bool hasBackEdge(MachineBasicBlock *MBB) const; - static unsigned getLoopDepth(MachineLoop *LoopRep); - bool isRetiredBlock(MachineBasicBlock *MBB) const; - bool isActiveLoophead(MachineBasicBlock *MBB) const; - PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry = true) const; - int countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const; - bool needMigrateBlock(MachineBasicBlock *MBB) const; - - // Utility Functions - void reversePredicateSetter(MachineBasicBlock::iterator I); - /// Compute the reversed DFS post order of Blocks - void orderBlocks(MachineFunction *MF); - - // Function originally from CFGStructTraits - void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); - void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, - DebugLoc DL); - void insertCondBranchBefore(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL); - void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); - static int getBranchNzeroOpcode(int OldOpcode); - static int getBranchZeroOpcode(int OldOpcode); - static int getContinueNzeroOpcode(int OldOpcode); - static int getContinueZeroOpcode(int OldOpcode); - static MachineBasicBlock *getTrueBranch(MachineInstr *MI); - static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); - static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI); - static bool isCondBranch(MachineInstr *MI); - static bool isUncondBranch(MachineInstr *MI); - static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); - static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); - /// The correct naming for this is getPossibleLoopendBlockBranchInstr. - /// - /// BB with backward-edge could have move instructions after the branch - /// instruction. Such move instruction "belong to" the loop backward-edge. 
- MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); - static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); - static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); - static bool isReturnBlock(MachineBasicBlock *MBB); - static void cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) ; - static MachineBasicBlock *clone(MachineBasicBlock *MBB); - /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose - /// because the AMDGPU instruction is not recognized as terminator fix this - /// and retire this routine - void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, - MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); - static void wrapup(MachineBasicBlock *MBB); - - - int patternMatch(MachineBasicBlock *MBB); - int patternMatchGroup(MachineBasicBlock *MBB); - int serialPatternMatch(MachineBasicBlock *MBB); - int ifPatternMatch(MachineBasicBlock *MBB); - int loopendPatternMatch(); - int mergeLoop(MachineLoop *LoopRep); - int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); - - void handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop); - /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in - /// the same loop with LoopLandInfo without explicitly keeping track of - /// loopContBlks and loopBreakBlks, this is a method to get the information. - bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, - MachineBasicBlock *Src2MBB); - int handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); - int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); - int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr); - void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock *LandMBB, bool Detail = false); - int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); - void mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB); - - void mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); - void mergeLooplandBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *LandMBB); - void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, - MachineBasicBlock *LandMBB); - void settleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineBasicBlock *ContMBB); - /// normalizeInfiniteLoopExit change - /// B1: - /// uncond_br LoopHeader - /// - /// to - /// B1: - /// cond_br 1 LoopHeader dummyExit - /// and return the newly added dummy exit block - MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); - void removeUnconditionalBranch(MachineBasicBlock *MBB); - /// Remove duplicate branches instructions in a block. 
- /// For instance - /// B0: - /// cond_br X B1 B2 - /// cond_br X B1 B2 - /// is transformed to - /// B0: - /// cond_br X B1 B2 - void removeRedundantConditionalBranch(MachineBasicBlock *MBB); - void addDummyExitBlock(SmallVectorImpl &RetMBB); - void removeSuccessor(MachineBasicBlock *MBB); - MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, - MachineBasicBlock *PredMBB); - void migrateInstruction(MachineBasicBlock *SrcMBB, - MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); - void recordSccnum(MachineBasicBlock *MBB, int SCCNum); - void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); - - MachineBasicBlock *findNearestCommonPostDom(std::set&); - /// This is work around solution for findNearestCommonDominator not available - /// to post dom a proper fix should go to Dominators.h. - MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2); - -private: - MBBInfoMap BlockInfoMap; - LoopLandInfoMap LLInfoMap; - std::map Visited; - MachineFunction *FuncRep; - SmallVector OrderedBlks; -}; - -int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return INVALIDSCCNUM; - return (*It).second->SccNum; -} - -MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) - const { - LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); - if (It == LLInfoMap.end()) - return nullptr; - return (*It).second; -} - -bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - if (!LoopRep) - return false; - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - return MBB->isSuccessor(LoopHeader); -} - -unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { - return LoopRep ? 
LoopRep->getLoopDepth() : 0; -} - -bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return false; - return (*It).second->IsRetired; -} - -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - while (LoopRep && LoopRep->getHeader() == MBB) { - MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); - if(!LoopLand) - return true; - if (!isRetiredBlock(LoopLand)) - return true; - LoopRep = LoopRep->getParentLoop(); - } - return false; -} -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry) const { - assert(DstMBB); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - while (SrcMBB && SrcMBB->succ_size() == 1) { - SrcMBB = *SrcMBB->succ_begin(); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - if (!AllowSideEntry && SrcMBB->pred_size() > 1) - return Not_SinglePath; - } - if (SrcMBB && SrcMBB->succ_size()==0) - return SinglePath_NotInPath; - return Not_SinglePath; -} - -int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const { - int Count = 0; - while (It != E) { - if (!isRetiredBlock(*It)) - ++Count; - ++It; - } - return Count; -} - -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { - unsigned BlockSizeThreshold = 30; - unsigned CloneInstrThreshold = 100; - bool MultiplePreds = MBB && (MBB->pred_size() > 1); - - if(!MultiplePreds) - return false; - unsigned BlkSize = MBB->size(); - return ((BlkSize > BlockSizeThreshold) && - (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); -} - -void AMDGPUCFGStructurizer::reversePredicateSetter( - MachineBasicBlock::iterator I) { - while (I--) { - if (I->getOpcode() == AMDGPU::PRED_X) { - switch (static_cast(I)->getOperand(2).getImm()) { - case OPCODE_IS_ZERO_INT: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO_INT); - return; - case OPCODE_IS_NOT_ZERO_INT: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO_INT); - return; - case OPCODE_IS_ZERO: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO); - return; - case OPCODE_IS_NOT_ZERO: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO); - return; - default: - llvm_unreachable("PRED_X Opcode invalid!"); - } - } - } -} - -void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = MBB->getParent() - ->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->push_back(MI); - //assume the instruction doesn't take any reg operand ... - SHOWNEWINSTR(MI); -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); - if (MBB->begin() != MBB->end()) - MBB->insert(MBB->begin(), MI); - else - MBB->push_back(MI); - SHOWNEWINSTR(MI); - return MI; -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( - MachineBasicBlock::iterator I, int NewOpcode) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineInstr *NewMBB = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->insert(I, NewMBB); - //assume the instruction doesn't take any reg operand ... 
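  // Note (added for clarity): despite its name, NewMBB is a MachineInstr*,
  // not a basic block; it is the newly created instruction that was just
  // inserted before I.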
- SHOWNEWINSTR(NewMBB); - return NewMBB; -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->insert(I, NewMI); - MachineInstrBuilder MIB(*MF, NewMI); - MIB.addReg(OldMI->getOperand(1).getReg(), false); - SHOWNEWINSTR(NewMI); - //erase later oldInstr->eraseFromParent(); -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL) { - MachineFunction *MF = blk->getParent(); - MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - //insert before - blk->insert(I, NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, - int NewOpcode, int RegNum) { - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewInstr = - MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->push_back(NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; - default: llvm_unreachable("internal error"); - }; - return -1; -} - -int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { - return MI->getOperand(0).getMBB(); -} - -void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, - MachineBasicBlock *MBB) { - MI->getOperand(0).setMBB(MBB); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI) { - assert(MBB->succ_size() == 2); - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - MachineBasicBlock::succ_iterator It = MBB->succ_begin(); - MachineBasicBlock::succ_iterator Next = It; - ++Next; - return (*It == TrueBranch) ? 
*Next : *It; -} - -bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::JUMP_COND: - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return true; - default: - return false; - } - return false; -} - -bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::JUMP: - case AMDGPU::BRANCH: - return true; - default: - return false; - } - return false; -} - -DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { - //get DebugLoc from the first MachineBasicBlock instruction with debug info - DebugLoc DL; - for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); - ++It) { - MachineInstr *instr = &(*It); - if (instr->getDebugLoc()) - DL = instr->getDebugLoc(); - } - return DL; -} - -MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( - MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - MachineInstr *MI = &*It; - if (MI && (isCondBranch(MI) || isUncondBranch(MI))) - return MI; - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( - MachineBasicBlock *MBB) { - for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); - It != E; ++It) { - // FIXME: Simplify - MachineInstr *MI = &*It; - if (MI) { - if (isCondBranch(MI) || isUncondBranch(MI)) - return MI; - else if (!TII->isMov(MI->getOpcode())) - break; - } - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *instr = &(*It); - if (instr->getOpcode() == AMDGPU::RETURN) - return instr; - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *MI = &(*It); - if (MI->getOpcode() == AMDGPU::CONTINUE) - return MI; - } - return nullptr; -} - -bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { - MachineInstr *MI = getReturnInstr(MBB); - bool IsReturn = (MBB->succ_size() == 0); - if (MI) - assert(IsReturn); - else if (IsReturn) - DEBUG( - dbgs() << "BB" << MBB->getNumber() - <<" is return block without RETURN instr\n";); - return IsReturn; -} - -void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), - iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) - DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of -} - -MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { - MachineFunction *Func = MBB->getParent(); - MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); - Func->push_back(NewMBB); //insert to function - for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); - It != E; ++It) { - MachineInstr *MI = Func->CloneMachineInstr(It); - NewMBB->push_back(MI); - } - return NewMBB; -} - -void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( - MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, - MachineBasicBlock *NewBlk) { - MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); - if (BranchMI && isCondBranch(BranchMI) && - getTrueBranch(BranchMI) == OldMBB) - setTrueBranch(BranchMI, NewBlk); -} - -void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { - assert((!MBB->getParent()->getJumpTableInfo() - || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) - && "found a jump table"); - - //collect continue right before endloop - SmallVector ContInstr; - MachineBasicBlock::iterator Pre = MBB->begin(); - MachineBasicBlock::iterator E = MBB->end(); - MachineBasicBlock::iterator It = Pre; - while (It != E) { - if (Pre->getOpcode() == AMDGPU::CONTINUE - && It->getOpcode() == AMDGPU::ENDLOOP) - ContInstr.push_back(Pre); - Pre = It; - ++It; - } - - //delete continue right before endloop - for (unsigned i = 0; i < ContInstr.size(); ++i) - ContInstr[i]->eraseFromParent(); - - // TODO to fix up jump table so later phase won't be confused. if - // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but - // there isn't such an interface yet. alternatively, replace all the other - // blocks in the jump table with the entryBlk //} - -} - - -bool AMDGPUCFGStructurizer::prepare() { - bool Changed = false; - - //FIXME: if not reducible flow graph, make it so ??? - - DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); - - orderBlocks(FuncRep); - - SmallVector RetBlks; - - // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - - if (ExitingMBBs.size() == 0) { - MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); - if (DummyExitBlk) - RetBlks.push_back(DummyExitBlk); - } - } - - // Remove unconditional branch instr. - // Add dummy exit block iff there are multiple returns. - for (SmallVectorImpl::const_iterator - It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - removeUnconditionalBranch(MBB); - removeRedundantConditionalBranch(MBB); - if (isReturnBlock(MBB)) { - RetBlks.push_back(MBB); - } - assert(MBB->succ_size() <= 2); - } - - if (RetBlks.size() >= 2) { - addDummyExitBlock(RetBlks); - Changed = true; - } - - return Changed; -} - -bool AMDGPUCFGStructurizer::run() { - - //Assume reducible CFG... - DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); - -#ifdef STRESSTEST - //Use the worse block ordering to test the algorithm. - ReverseVector(orderedBlks); -#endif - - DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); - int NumIter = 0; - bool Finish = false; - MachineBasicBlock *MBB; - bool MakeProgress = false; - int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), - OrderedBlks.end()); - - do { - ++NumIter; - DEBUG( - dbgs() << "numIter = " << NumIter - << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; - ); - - SmallVectorImpl::const_iterator It = - OrderedBlks.begin(); - SmallVectorImpl::const_iterator E = - OrderedBlks.end(); - - SmallVectorImpl::const_iterator SccBeginIter = - It; - MachineBasicBlock *SccBeginMBB = nullptr; - int SccNumBlk = 0; // The number of active blocks, init to a - // maximum possible number. - int SccNumIter; // Number of iteration in this SCC. - - while (It != E) { - MBB = *It; - - if (!SccBeginMBB) { - SccBeginIter = It; - SccBeginMBB = MBB; - SccNumIter = 0; - SccNumBlk = NumRemainedBlk; // Init to maximum possible number. - DEBUG( - dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); - dbgs() << "\n"; - ); - } - - if (!isRetiredBlock(MBB)) - patternMatch(MBB); - - ++It; - - bool ContNextScc = true; - if (It == E - || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { - // Just finish one scc. 
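          // Note (added for clarity): the code below checks whether this SCC
          // actually shrank. If the count of still-active blocks did not
          // drop, the SCC is given up for this outer iteration; if it dropped
          // but is still greater than one, the iterator is rewound to
          // SccBeginIter and the same SCC is pattern-matched again.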
- ++SccNumIter; - int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); - if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { - DEBUG( - dbgs() << "Can't reduce SCC " << getSCCNum(MBB) - << ", sccNumIter = " << SccNumIter; - dbgs() << "doesn't make any progress\n"; - ); - ContNextScc = true; - } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { - SccNumBlk = sccRemainedNumBlk; - It = SccBeginIter; - ContNextScc = false; - DEBUG( - dbgs() << "repeat processing SCC" << getSCCNum(MBB) - << "sccNumIter = " << SccNumIter << '\n'; - ); - } else { - // Finish the current scc. - ContNextScc = true; - } - } else { - // Continue on next component in the current scc. - ContNextScc = false; - } - - if (ContNextScc) - SccBeginMBB = nullptr; - } //while, "one iteration" over the function. - - MachineBasicBlock *EntryMBB = - GraphTraits::nodes_begin(FuncRep); - if (EntryMBB->succ_size() == 0) { - Finish = true; - DEBUG( - dbgs() << "Reduce to one block\n"; - ); - } else { - int NewnumRemainedBlk - = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); - // consider cloned blocks ?? - if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { - MakeProgress = true; - NumRemainedBlk = NewnumRemainedBlk; - } else { - MakeProgress = false; - DEBUG( - dbgs() << "No progress\n"; - ); - } - } - } while (!Finish && MakeProgress); - - // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits::nodes_begin(FuncRep)); - - // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - DEBUG( - dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; - ); - (*It).first->eraseFromParent(); //Remove from the parent Function. 
- } - delete (*It).second; - } - BlockInfoMap.clear(); - LLInfoMap.clear(); - - if (!Finish) { - DEBUG(FuncRep->viewCFG()); - llvm_unreachable("IRREDUCIBLE_CFG"); - } - - return true; -} - - - -void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { - int SccNum = 0; - MachineBasicBlock *MBB; - for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); - ++It, ++SccNum) { - const std::vector &SccNext = *It; - for (std::vector::const_iterator - blockIter = SccNext.begin(), blockEnd = SccNext.end(); - blockIter != blockEnd; ++blockIter) { - MBB = *blockIter; - OrderedBlks.push_back(MBB); - recordSccnum(MBB, SccNum); - } - } - - //walk through all the block in func to check for unreachable - typedef GraphTraits GTM; - MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); - for (; It != E; ++It) { - MachineBasicBlock *MBB = &(*It); - SccNum = getSCCNum(MBB); - if (SccNum == INVALIDSCCNUM) - dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; - } -} - -int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { - int NumMatch = 0; - int CurMatch; - - DEBUG( - dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; - ); - - while ((CurMatch = patternMatchGroup(MBB)) > 0) - NumMatch += CurMatch; - - DEBUG( - dbgs() << "End patternMatch BB" << MBB->getNumber() - << ", numMatch = " << NumMatch << "\n"; - ); - - return NumMatch; -} - -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { - int NumMatch = 0; - NumMatch += loopendPatternMatch(); - NumMatch += serialPatternMatch(MBB); - NumMatch += ifPatternMatch(MBB); - return NumMatch; -} - - -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { - if (MBB->succ_size() != 1) - return 0; - - MachineBasicBlock *childBlk = *MBB->succ_begin(); - if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) - return 0; - - mergeSerialBlock(MBB, childBlk); - ++numSerialPatternMatch; - return 1; -} - -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { - //two edges - if (MBB->succ_size() != 2) - return 0; - if (hasBackEdge(MBB)) - return 0; - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - if (!BranchMI) - return 0; - - assert(isCondBranch(BranchMI)); - int NumMatch = 0; - - MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); - NumMatch += serialPatternMatch(TrueMBB); - NumMatch += ifPatternMatch(TrueMBB); - MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); - NumMatch += serialPatternMatch(FalseMBB); - NumMatch += ifPatternMatch(FalseMBB); - MachineBasicBlock *LandBlk; - int Cloned = 0; - - assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); - // TODO: Simplify - if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 - && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { - // Diamond pattern - LandBlk = *TrueMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { - // Triangle pattern, false is empty - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && *FalseMBB->succ_begin() == TrueMBB) { - // Triangle pattern, true is empty - // We reverse the predicate to make a triangle, empty false pattern; - std::swap(TrueMBB, FalseMBB); - reversePredicateSetter(MBB->end()); - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { - LandBlk = *FalseMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 - && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { - LandBlk = 
*TrueMBB->succ_begin(); - } else { - return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); - } - - // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the - // new BB created for landBlk==NULL may introduce new challenge to the - // reduction process. - if (LandBlk && - ((TrueMBB && TrueMBB->pred_size() > 1) - || (FalseMBB && FalseMBB->pred_size() > 1))) { - Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); - } - - if (TrueMBB && TrueMBB->pred_size() > 1) { - TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); - ++Cloned; - } - - if (FalseMBB && FalseMBB->pred_size() > 1) { - FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); - ++Cloned; - } - - mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); - - ++numIfPatternMatch; - - numClonedBlock += Cloned; - - return 1 + Cloned + NumMatch; -} - -int AMDGPUCFGStructurizer::loopendPatternMatch() { - std::deque NestedLoops; - for (auto &It: *MLI) - for (MachineLoop *ML : depth_first(It)) - NestedLoops.push_front(ML); - - if (NestedLoops.size() == 0) - return 0; - - // Process nested loop outside->inside (we did push_front), - // so "continue" to a outside loop won't be mistaken as "break" - // of the current loop. - int Num = 0; - for (MachineLoop *ExaminedLoop : NestedLoops) { - if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) - continue; - DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); - int NumBreak = mergeLoop(ExaminedLoop); - if (NumBreak == -1) - break; - Num += NumBreak; - } - return Num; -} - -int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); - DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); - // We assume a single ExitBlk - MBBVector ExitBlks; - LoopRep->getExitBlocks(ExitBlks); - SmallPtrSet ExitBlkSet; - for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) - ExitBlkSet.insert(ExitBlks[i]); - assert(ExitBlkSet.size() == 1); - MachineBasicBlock *ExitBlk = *ExitBlks.begin(); - assert(ExitBlk && "Loop has several exit block"); - MBBVector LatchBlks; - typedef GraphTraits > InvMBBTraits; - InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), - PE = InvMBBTraits::child_end(LoopHeader); - for (; PI != PE; PI++) { - if (LoopRep->contains(*PI)) - LatchBlks.push_back(*PI); - } - - for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) - mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); - for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) - settleLoopcontBlock(LatchBlks[i], LoopHeader); - int Match = 0; - do { - Match = 0; - Match += serialPatternMatch(LoopHeader); - Match += ifPatternMatch(LoopHeader); - } while (Match > 0); - mergeLooplandBlock(LoopHeader, ExitBlk); - MachineLoop *ParentLoop = LoopRep->getParentLoop(); - if (ParentLoop) - MLI->changeLoopFor(LoopHeader, ParentLoop); - else - MLI->removeBlock(LoopHeader); - Visited[LoopRep] = true; - return 1; -} - -int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, - MachineBasicBlock *LoopHeader) { - int NumCont = 0; - SmallVector ContMBB; - typedef GraphTraits > GTIM; - GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), - E = GTIM::child_end(LoopHeader); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; - if (LoopRep->contains(MBB)) { - handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), - LoopHeader, LoopRep); - 
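      // Note (added for clarity): the continuing block is only recorded here;
      // its successor edge back to LoopHeader is removed in the follow-up
      // loop, once every latch of this loop has been settled.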
ContMBB.push_back(MBB); - ++NumCont; - } - } - - for (SmallVectorImpl::iterator It = ContMBB.begin(), - E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); - } - - numLoopcontPatternMatch += NumCont; - - return NumCont; -} - - -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( - MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { - if (Src1MBB->succ_size() == 0) { - MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); - if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { - MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; - if (TheEntry) { - DEBUG( - dbgs() << "isLoopContBreakBlock yes src1 = BB" - << Src1MBB->getNumber() - << " src2 = BB" << Src2MBB->getNumber() << "\n"; - ); - return true; - } - } - } - return false; -} - -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); - if (Num == 0) { - DEBUG( - dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; - ); - Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); - } - return Num; -} - -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = 0; - MachineBasicBlock *DownBlk; - - //trueBlk could be the common post dominator - DownBlk = TrueMBB; - - DEBUG( - dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() - << " true = BB" << TrueMBB->getNumber() - << ", numSucc=" << TrueMBB->succ_size() - << " false = BB" << FalseMBB->getNumber() << "\n"; - ); - - while (DownBlk) { - DEBUG( - dbgs() << "check down = BB" << DownBlk->getNumber(); - ); - - if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { - DEBUG( - dbgs() << " working\n"; - ); - - Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); - Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); - - numClonedBlock += Num; - Num += serialPatternMatch(*HeadMBB->succ_begin()); - Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); - Num += ifPatternMatch(HeadMBB); - assert(Num > 0); - - break; - } - DEBUG( - dbgs() << " not working\n"; - ); - DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; - } // walk down the postDomTree - - return Num; -} - -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( - MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { - dbgs() << "head = BB" << HeadMBB->getNumber() - << " size = " << HeadMBB->size(); - if (Detail) { - dbgs() << "\n"; - HeadMBB->print(dbgs()); - dbgs() << "\n"; - } - - if (TrueMBB) { - dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " - << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - TrueMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (FalseMBB) { - dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " - << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - FalseMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (LandMBB) { - dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " - << LandMBB->size() << " numPred = " << LandMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - LandMBB->print(dbgs()); - dbgs() << "\n"; - } - } - - dbgs() << "\n"; -} - -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr) { - bool MigrateTrue = false; - bool MigrateFalse = false; - - MachineBasicBlock *LandBlk = *LandMBBPtr; - - assert((!TrueMBB || TrueMBB->succ_size() <= 1) - && (!FalseMBB || FalseMBB->succ_size() <= 1)); - - if (TrueMBB == FalseMBB) - return 0; - - MigrateTrue = needMigrateBlock(TrueMBB); - MigrateFalse = needMigrateBlock(FalseMBB); - - if (!MigrateTrue && !MigrateFalse) - return 0; - - // If we need to migrate either trueBlk and falseBlk, migrate the rest that - // have more than one predecessors. without doing this, its predecessor - // rather than headBlk will have undefined value in initReg. - if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) - MigrateTrue = true; - if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) - MigrateFalse = true; - - DEBUG( - dbgs() << "before improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); - - // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk - // - // new: headBlk => if () {initReg = 1; org trueBlk branch} else - // {initReg = 0; org falseBlk branch } - // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} - // => org landBlk - // if landBlk->pred_size() > 2, put the about if-else inside - // if (initReg !=2) {...} - // - // add initReg = initVal to headBlk - - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); - if (!MigrateTrue || !MigrateFalse) { - // XXX: We have an opportunity here to optimize the "branch into if" case - // here. Branch into if looks like this: - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true - // \ / - // done - // - // The diamond_head block begins the "if" and the diamond_true block - // is the block being "branched into". - // - // If MigrateTrue is true, then TrueBB is the block being "branched into" - // and if MigrateFalse is true, then FalseBB is the block being - // "branched into" - // - // Here is the pseudo code for how I think the optimization should work: - // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. - // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. - // 3. 
Move the branch instruction from diamond_head into its own basic - // block (new_block). - // 4. Add an unconditional branch from diamond_head to new_block - // 5. Replace the branch instruction in branch_from with an unconditional - // branch to new_block. If branch_from has multiple predecessors, then - // we need to replace the True/False block in the branch - // instruction instead of replacing it. - // 6. Change the condition of the branch instruction in new_block from - // COND to (COND || GPR0) - // - // In order insert these MOV instruction, we will need to use the - // RegisterScavenger. Usually liveness stops being tracked during - // the late machine optimization passes, however if we implement - // bool TargetRegisterInfo::requiresRegisterScavenging( - // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly - // by generic optimization passes. We will also need to make sure that - // all of our target-specific passes that run after regalloc and before - // the CFGStructurizer track liveness and we will need to modify this pass - // to correctly track liveness. - // - // After the above changes, the new CFG should look like this: - // entry - // / | - // diamond_head branch_from - // \ / - // new_block - // / | - // diamond_false diamond_true - // \ / - // done - // - // Without this optimization, we are forced to duplicate the diamond_true - // block and we will end up with a CFG like this: - // - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true diamond_true (duplicate) - // \ / | - // done --------------------| - // - // Duplicating diamond_true can be very costly especially if it has a - // lot of instructions. - return 0; - } - - int NumNewBlk = 0; - - bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - - //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); - - if (LandBlkHasOtherPred) { - llvm_unreachable("Extra register needed to handle CFG"); - unsigned CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, - CmpResReg, DebugLoc()); - } - - // XXX: We are running this after RA, so creating virtual registers will - // cause an assertion failure in the PostRA scheduling pass. - unsigned InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, - DebugLoc()); - - if (MigrateTrue) { - migrateInstruction(TrueMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 1). 
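The comment block above is easier to follow as a source-level picture. The sketch below is only an illustration with invented names (initReg, trueBody, falseBody, land); the pass itself performs the equivalent rewrite on MachineBasicBlocks and emits the IF_PREDICATE_SET/ELSE/ENDIF pseudo instructions seen in the surrounding code. It also shows why the assignment to initReg has to be inserted unconditionally: the landing block re-dispatches on initReg, so any path that reaches it without writing that value would read an undefined register.

.. code-block:: c++

   #include <cstdio>

   static void trueBody()  { std::puts("original true block");  }
   static void falseBody() { std::puts("original false block"); }
   static void land()      { std::puts("original landing block"); }

   // before:  head => if (cond) { trueBody } else { falseBody } => land
   // after:   head only records the taken side; a single landing block
   //          re-dispatches on initReg and then falls through to land.
   static void improvedJumpIntoIf(bool cond) {
     int initReg;
     if (cond)
       initReg = 1;   // "initReg = 1; org trueBlk branch"
     else
       initReg = 0;   // "initReg = 0; org falseBlk branch"
     if (initReg)     // landing block: if (initReg) { org trueBlk }
       trueBody();
     else             //                else         { org falseBlk }
       falseBody();
     land();          // org landBlk
   }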
- llvm_unreachable("Extra register needed to handle CFG"); - } - insertInstrBefore(I, AMDGPU::ELSE); - - if (MigrateFalse) { - migrateInstruction(FalseMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 0) - llvm_unreachable("Extra register needed to handle CFG"); - } - - if (LandBlkHasOtherPred) { - // add endif - insertInstrBefore(I, AMDGPU::ENDIF); - - // put initReg = 2 to other predecessors of landBlk - for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), - PE = LandBlk->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *MBB = *PI; - if (MBB != TrueMBB && MBB != FalseMBB) - llvm_unreachable("Extra register needed to handle CFG"); - } - } - DEBUG( - dbgs() << "result from improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); - - // update landBlk - *LandMBBPtr = LandBlk; - - return NumNewBlk; -} - -void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop) { - DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() - << " header = BB" << ContMBB->getNumber() << "\n"; - dbgs() << "Trying to continue loop-depth = " - << getLoopDepth(ContLoop) - << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); - settleLoopcontBlock(ContingMBB, ContMBB); -} - -void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - DEBUG( - dbgs() << "serialPattern BB" << DstMBB->getNumber() - << " <= BB" << SrcMBB->getNumber() << "\n"; - ); - DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - - DstMBB->removeSuccessor(SrcMBB); - cloneSuccessorList(DstMBB, SrcMBB); - - removeSuccessor(SrcMBB); - MLI->removeBlock(SrcMBB); - retireBlock(SrcMBB); -} - -void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { - assert (TrueMBB); - DEBUG( - dbgs() << "ifPattern BB" << MBB->getNumber(); - dbgs() << "{ "; - if (TrueMBB) { - dbgs() << "BB" << TrueMBB->getNumber(); - } - dbgs() << " } else "; - dbgs() << "{ "; - if (FalseMBB) { - dbgs() << "BB" << FalseMBB->getNumber(); - } - dbgs() << " }\n "; - dbgs() << "landBlock: "; - if (!LandMBB) { - dbgs() << "NULL"; - } else { - dbgs() << "BB" << LandMBB->getNumber(); - } - dbgs() << "\n"; - ); - - int OldOpcode = BranchMI->getOpcode(); - DebugLoc BranchDL = BranchMI->getDebugLoc(); - -// transform to -// if cond -// trueBlk -// else -// falseBlk -// endif -// landBlk - - MachineBasicBlock::iterator I = BranchMI; - insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), - BranchDL); - - if (TrueMBB) { - MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); - if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); - retireBlock(TrueMBB); - MLI->removeBlock(TrueMBB); - } - - if (FalseMBB) { - insertInstrBefore(I, AMDGPU::ELSE); - MBB->splice(I, FalseMBB, FalseMBB->begin(), - FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); - if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); - retireBlock(FalseMBB); - MLI->removeBlock(FalseMBB); - } - insertInstrBefore(I, AMDGPU::ENDIF); - - BranchMI->eraseFromParent(); - - if (LandMBB && TrueMBB && FalseMBB) - MBB->addSuccessor(LandMBB); - -} - -void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, - MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); - - insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); - insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); -} - - -void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, - MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); - MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); - assert(BranchMI && isCondBranch(BranchMI)); - DebugLoc DL = BranchMI->getDebugLoc(); - MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); - MachineBasicBlock::iterator I = BranchMI; - if (TrueBranch != LandMBB) - reversePredicateSetter(I); - insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); - insertInstrBefore(I, AMDGPU::BREAK); - insertInstrBefore(I, AMDGPU::ENDIF); - //now branchInst can be erase safely - BranchMI->eraseFromParent(); - //now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); -} - -void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineBasicBlock *ContMBB) { - DEBUG(dbgs() << "settleLoopcontBlock conting = BB" - << ContingMBB->getNumber() - << ", cont = BB" << ContMBB->getNumber() << "\n";); - - MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); - if (MI) { - assert(isCondBranch(MI)); - MachineBasicBlock::iterator I = MI; - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - int OldOpcode = MI->getOpcode(); - DebugLoc DL = MI->getDebugLoc(); - - bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); - - if (!UseContinueLogical) { - int BranchOpcode = - TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : - getBranchZeroOpcode(OldOpcode); - insertCondBranchBefore(I, BranchOpcode, DL); - // insertEnd to ensure phi-moves, if exist, go before the continue-instr. - insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); - insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); - } else { - int BranchOpcode = - TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : - getContinueZeroOpcode(OldOpcode); - insertCondBranchBefore(I, BranchOpcode, DL); - } - - MI->eraseFromParent(); - } else { - // if we've arrived here then we've already erased the branch instruction - // travel back up the basic block to see the last reference of our debug - // location we've just inserted that reference here so it should be - // representative insertEnd to ensure phi-moves, if exist, go before the - // continue-instr. 
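Taken together, mergeLooplandBlock, mergeLoopbreakBlock and settleLoopcontBlock lower the remaining loop edges into the structured WHILELOOP/ENDLOOP, predicated BREAK and predicated CONTINUE pseudo instructions. The snippet below is a rough source-level analogue of the control flow they produce; the loop body and names are invented for illustration, since the pass itself rewrites MachineBasicBlocks rather than C++.

.. code-block:: c++

   #include <cstdio>

   // Rough correspondence (illustration only):
   //   loop region + land block -> WHILELOOP ... ENDLOOP (mergeLooplandBlock)
   //   exiting edge             -> predicated BREAK      (mergeLoopbreakBlock)
   //   early back edge          -> predicated CONTINUE   (settleLoopcontBlock)
   static void structuredLoop(int n) {
     for (int i = 0; i < n; ++i) {
       if (i % 2 == 0)
         continue;                  // continue edge settled above
       if (i > 8)
         break;                     // break edge routed to the land block
       std::printf("body %d\n", i);
     }
     // loop landing block: execution resumes here after BREAK or normal exit
   }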
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, - getLastDebugLocInBB(ContingMBB)); - } -} - -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { - int Cloned = 0; - assert(PreMBB->isSuccessor(SrcMBB)); - while (SrcMBB && SrcMBB != DstMBB) { - assert(SrcMBB->succ_size() == 1); - if (SrcMBB->pred_size() > 1) { - SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); - ++Cloned; - } - - PreMBB = SrcMBB; - SrcMBB = *SrcMBB->succ_begin(); - } - - return Cloned; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, - MachineBasicBlock *PredMBB) { - assert(PredMBB->isSuccessor(MBB) && - "succBlk is not a prececessor of curBlk"); - - MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions - replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); - //srcBlk, oldBlk, newBlk - - PredMBB->removeSuccessor(MBB); - PredMBB->addSuccessor(CloneMBB); - - // add all successor to cloneBlk - cloneSuccessorList(CloneMBB, MBB); - - numClonedInstr += MBB->size(); - - DEBUG( - dbgs() << "Cloned block: " << "BB" - << MBB->getNumber() << "size " << MBB->size() << "\n"; - ); - - SHOWNEWBLK(CloneMBB, "result of Cloned block: "); - - return CloneMBB; -} - -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, - MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { - MachineBasicBlock::iterator SpliceEnd; - //look for the input branchinstr, not the AMDGPU branchinstr - MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); - if (!BranchMI) { - DEBUG( - dbgs() << "migrateInstruction don't see branch instr\n" ; - ); - SpliceEnd = SrcMBB->end(); - } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); - SpliceEnd = BranchMI; - } - DEBUG( - dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; - ); - - //splice insert before insertPos - DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); - - DEBUG( - dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; - ); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); - - if (!LoopHeader || !LoopLatch) - return nullptr; - MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); - // Is LoopRep an infinite loop ? 
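normalizeInfiniteLoopExit addresses the question in the comment above: the check that follows treats the loop as infinite when its latch ends in an unconditional branch back to the header, i.e. the loop has no exit edge at all. The idea is to add a dummy exit block and turn the back edge into a conditional branch on a value intended to always be true, so behaviour is unchanged but the CFG gains an exit edge for the dominator and post-dominator analyses the structurizer relies on. The code below is only a source-level sketch with invented names; the real implementation builds a BRANCH_COND_i32 on a fresh virtual register and, per the llvm_unreachable later in the function, still needs that register to be materialised.

.. code-block:: c++

   #include <cstdio>

   static volatile bool alwaysTrue = true; // stands in for the constant-true
                                           // register fed to the new branch

   static void loopWithSyntheticExit() {
     // before: for (;;) { body(); }   -- no exit edge
     // after:  conditional back edge plus a dummy exit block
     while (alwaysTrue) {
       std::puts("loop body");
     }
     std::puts("dummy exit block");      // never reached in practice
   }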
- if (!BranchMI || !isUncondBranch(BranchMI)) - return nullptr; - - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); - DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - MachineBasicBlock::iterator I = BranchMI; - unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra register needed to handle CFG"); - MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); - MachineInstrBuilder MIB(*FuncRep, NewMI); - MIB.addMBB(LoopHeader); - MIB.addReg(ImmReg, false); - SHOWNEWINSTR(NewMI); - BranchMI->eraseFromParent(); - LoopLatch->addSuccessor(DummyExitBlk); - - return DummyExitBlk; -} - -void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { - MachineInstr *BranchMI; - - // I saw two unconditional branch in one basic block in example - // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. - while ((BranchMI = getLoopendBlockBranchInstr(MBB)) - && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); - BranchMI->eraseFromParent(); - } -} - -void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( - MachineBasicBlock *MBB) { - if (MBB->succ_size() != 2) - return; - MachineBasicBlock *MBB1 = *MBB->succ_begin(); - MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); - if (MBB1 != MBB2) - return; - - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); - BranchMI->eraseFromParent(); - SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); -} - -void AMDGPUCFGStructurizer::addDummyExitBlock( - SmallVectorImpl &RetMBB) { - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); - - for (SmallVectorImpl::iterator It = RetMBB.begin(), - E = RetMBB.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - MachineInstr *MI = getReturnInstr(MBB); - if (MI) - MI->eraseFromParent(); - MBB->addSuccessor(DummyExitBlk); - DEBUG( - dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() - << " successors\n"; - ); - } - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); -} - -void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { - while (MBB->succ_size()) - MBB->removeSuccessor(*MBB->succ_begin()); -} - -void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, - int SccNum) { - BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; - if (!srcBlkInfo) - srcBlkInfo = new BlockInformation(); - srcBlkInfo->SccNum = SccNum; -} - -void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { - DEBUG( - dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; - ); - - BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; - - if (!SrcBlkInfo) - SrcBlkInfo = new BlockInformation(); - - SrcBlkInfo->IsRetired = true; - assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 - && "can't retire block yet"); -} - -void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, - MachineBasicBlock *MBB) { - MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; - if (!MBB) { - MBB = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(MBB); //insert to function - SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); - } - TheEntry 
= MBB; - DEBUG( - dbgs() << "setLoopLandBlock loop-header = BB" - << loopRep->getHeader()->getNumber() - << " landing-block = BB" << MBB->getNumber() << "\n"; - ); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2) { - - if (PDT->dominates(MBB1, MBB2)) - return MBB1; - if (PDT->dominates(MBB2, MBB1)) - return MBB2; - - MachineDomTreeNode *Node1 = PDT->getNode(MBB1); - MachineDomTreeNode *Node2 = PDT->getNode(MBB2); - - // Handle newly cloned node. - if (!Node1 && MBB1->succ_size() == 1) - return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); - if (!Node2 && MBB2->succ_size() == 1) - return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); - - if (!Node1 || !Node2) - return nullptr; - - Node1 = Node1->getIDom(); - while (Node1) { - if (PDT->dominates(Node1, Node2)) - return Node1->getBlock(); - Node1 = Node1->getIDom(); - } - - return nullptr; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom( - std::set &MBBs) { - MachineBasicBlock *CommonDom; - std::set::const_iterator It = MBBs.begin(); - std::set::const_iterator E = MBBs.end(); - for (CommonDom = *It; It != E && CommonDom; ++It) { - MachineBasicBlock *MBB = *It; - if (MBB != CommonDom) - CommonDom = findNearestCommonPostDom(MBB, CommonDom); - } - - DEBUG( - dbgs() << "Common post dominator for exit blocks is "; - if (CommonDom) - dbgs() << "BB" << CommonDom->getNumber() << "\n"; - else - dbgs() << "NULL\n"; - ); - - return CommonDom; -} - -char AMDGPUCFGStructurizer::ID = 0; - -} // end anonymous namespace - - -INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", - "AMDGPU CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", - "AMDGPU CFG Structurizer", false, false) - -FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { - return new AMDGPUCFGStructurizer(); -} diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h deleted file mode 100644 index 4d3041ff3db..00000000000 --- a/lib/Target/R600/AMDKernelCodeT.h +++ /dev/null @@ -1,704 +0,0 @@ -//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file AMDKernelCodeT.h -//===----------------------------------------------------------------------===// - -#ifndef AMDKERNELCODET_H -#define AMDKERNELCODET_H - -#include -#include - -//---------------------------------------------------------------------------// -// AMD Kernel Code, and its dependencies // -//---------------------------------------------------------------------------// - -typedef uint8_t hsa_powertwo8_t; -typedef uint32_t hsa_ext_code_kind_t; -typedef uint8_t hsa_ext_brig_profile8_t; -typedef uint8_t hsa_ext_brig_machine_model8_t; -typedef uint64_t hsa_ext_control_directive_present64_t; -typedef uint16_t hsa_ext_exception_kind16_t; -typedef uint32_t hsa_ext_code_kind32_t; - -typedef struct hsa_dim3_s { - uint32_t x; - uint32_t y; - uint32_t z; -} hsa_dim3_t; - -/// The version of the amd_*_code_t struct. Minor versions must be -/// backward compatible. 
-typedef uint32_t amd_code_version32_t; -enum amd_code_version_t { - AMD_CODE_VERSION_MAJOR = 0, - AMD_CODE_VERSION_MINOR = 1 -}; - -/// The values used to define the number of bytes to use for the -/// swizzle element size. -enum amd_element_byte_size_t { - AMD_ELEMENT_2_BYTES = 0, - AMD_ELEMENT_4_BYTES = 1, - AMD_ELEMENT_8_BYTES = 2, - AMD_ELEMENT_16_BYTES = 3 -}; - -/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and -/// COMPUTE_PGM_RSRC2 registers. -typedef uint64_t amd_compute_pgm_resource_register64_t; - -/// Every amd_*_code_t has the following properties, which are composed of -/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), -/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount -/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. -/// -/// (Note that bit fields cannot be used as their layout is -/// implementation defined in the C standard and so cannot be used to -/// specify an ABI) -typedef uint32_t amd_code_property32_t; -enum amd_code_property_mask_t { - - /// Enable the setup of the SGPR user data registers - /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t - /// for initial register state. - /// - /// The total number of SGPRuser data registers requested must not - /// exceed 16. Any requests beyond 16 will be ignored. - /// - /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of - /// SGPR user data registers enabled up to 16). - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, - 
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - - /// Control wave ID base counter for GDS ordered-append. Used to set - /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if - /// ORDERED_APPEND_MODE also needs to be settable) - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, - - /// The interleave (swizzle) element size in bytes required by the - /// code for private memory. This must be 2, 4, 8 or 16. This value - /// is provided to the finalizer when it is invoked and is recorded - /// here. The hardware will interleave the memory requests of each - /// lane of a wavefront by this element size to ensure each - /// work-item gets a distinct memory memory location. Therefore, the - /// finalizer ensures that all load and store operations done to - /// private memory do not exceed this size. For example, if the - /// element size is 4 (32-bits or dword) and a 64-bit value must be - /// loaded, the finalizer will generate two 32-bit loads. This - /// ensures that the interleaving will get the the work-item - /// specific dword for both halves of the 64-bit value. If it just - /// did a 64-bit load then it would get one dword which belonged to - /// its own work-item, but the second dword would belong to the - /// adjacent lane work-item since the interleaving is in dwords. - /// - /// The value used must match the value that the runtime configures - /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This - /// is generally DWORD. - /// - /// Use values from the amd_element_byte_size_t enum. - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, - - /// Are global memory addresses 64 bits. Must match - /// amd_kernel_code_t.hsail_machine_model == - /// HSA_MACHINE_LARGE. Must also match - /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), - /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). - AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, - AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, - AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, - - /// Indicate if the generated ISA is using a dynamically sized call - /// stack. 
This can happen if calls are implemented using a call - /// stack and recursion, alloca or calls to indirect functions are - /// present. In these cases the Finalizer cannot compute the total - /// private segment size at compile time. In this case the - /// workitem_private_segment_byte_size only specifies the statically - /// know private segment size, and additional space must be added - /// for the call stack. - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, - - /// Indicate if code generated has support for debugging. - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT -}; - -/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL -/// control directives. These control how the finalizer generates code. This -/// struct is used both as an argument to hsaFinalizeKernel to specify values for -/// the control directives, and is used in HsaKernelCode to record the values of -/// the control directives that the finalize used when generating the code which -/// either came from the finalizer argument or explicit HSAIL control -/// directives. See the definition of the control directives in HSA Programmer's -/// Reference Manual which also defines how the values specified as finalizer -/// arguments have to agree with the control directives in the HSAIL code. -typedef struct hsa_ext_control_directives_s { - /// This is a bit set indicating which control directives have been - /// specified. If the value is 0 then there are no control directives specified - /// and the rest of the fields can be ignored. The bits are accessed using the - /// hsa_ext_control_directives_present_mask_t. Any control directive that is not - /// enabled in this bit set must have the value of all 0s. - hsa_ext_control_directive_present64_t enabled_control_directives; - - /// If enableBreakExceptions is not enabled then must be 0, otherwise must be - /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK - /// policy enabled. If this set is not empty then the generated code may have - /// lower performance than if the set is empty. If the kernel being finalized - /// has any enablebreakexceptions control directives, then the values specified - /// by this argument are unioned with the values in these control - /// directives. If any of the functions the kernel calls have an - /// enablebreakexceptions control directive, then they must be equal or a - /// subset of, this union. - hsa_ext_exception_kind16_t enable_break_exceptions; - - /// If enableDetectExceptions is not enabled then must be 0, otherwise must be - /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT - /// policy enabled. If this set is not empty then the generated code may have - /// lower performance than if the set is empty. However, an implementation - /// should endeavour to make the performance impact small. If the kernel being - /// finalized has any enabledetectexceptions control directives, then the - /// values specified by this argument are unioned with the values in these - /// control directives. 
If any of the functions the kernel calls have an - /// enabledetectexceptions control directive, then they must be equal or a - /// subset of, this union. - hsa_ext_exception_kind16_t enable_detect_exceptions; - - /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of - /// dynamic group segment can be allocated for a dispatch, otherwise the value - /// specifies the maximum number of bytes of dynamic group segment that can be - /// allocated for a dispatch. If the kernel being finalized has any - /// maxdynamicsize control directives, then the values must be the same, and - /// must be the same as this argument if it is enabled. This value can be used - /// by the finalizer to determine the maximum number of bytes of group memory - /// used by each work-group by adding this value to the group memory required - /// for all group segment variables used by the kernel and all functions it - /// calls, and group memory used to implement other HSAIL features such as - /// fbarriers and the detect exception operations. This can allow the finalizer - /// to determine the expected number of work-groups that can be executed by a - /// compute unit and allow more resources to be allocated to the work-items if - /// it is known that fewer work-groups can be executed due to group memory - /// limitations. - uint32_t max_dynamic_group_size; - - /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater - /// than 0. See HSA Programmer's Reference Manual description of - /// maxflatgridsize control directive. - uint32_t max_flat_grid_size; - - /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be - /// greater than 0. See HSA Programmer's Reference Manual description of - /// maxflatworkgroupsize control directive. - uint32_t max_flat_workgroup_size; - - /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the - /// finalizer is free to generate ISA that may result in any number of - /// work-groups executing on a single compute unit. Otherwise, the finalizer - /// should attempt to generate ISA that will allow the specified number of - /// work-groups to execute on a single compute unit. This is only a hint and - /// can be ignored by the finalizer. If the kernel being finalized, or any of - /// the functions it calls, has a requested control directive, then the values - /// must be the same. This can be used to determine the number of resources - /// that should be allocated to a single work-group and work-item. For example, - /// a low value may allow more resources to be allocated, resulting in higher - /// per work-item performance, as it is known there will never be more than the - /// specified number of work-groups actually executing on the compute - /// unit. Conversely, a high value may allocate fewer resources, resulting in - /// lower per work-item performance, which is offset by the fact it allows more - /// work-groups to actually execute on the compute unit. - uint32_t requested_workgroups_per_cu; - - /// If not enabled then all elements for Dim3 must be 0, otherwise every - /// element must be greater than 0. See HSA Programmer's Reference Manual - /// description of requiredgridsize control directive. - hsa_dim3_t required_grid_size; - - /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be - /// 0, and the produced code can be dispatched with any legal work-group range - /// consistent with the dispatch dimensions. 
Otherwise, the code produced must - /// always be dispatched with the specified work-group range. No element of the - /// specified range must be 0. It must be consistent with required_dimensions - /// and max_flat_workgroup_size. If the kernel being finalized, or any of the - /// functions it calls, has a requiredworkgroupsize control directive, then the - /// values must be the same. Specifying a value can allow the finalizer to - /// optimize work-group id operations, and if the number of work-items in the - /// work-group is less than the WAVESIZE then barrier operations can be - /// optimized to just a memory fence. - hsa_dim3_t required_workgroup_size; - - /// If requiredDim is not enabled then must be 0 and the produced kernel code - /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is - /// 1..3 and the code produced must only be dispatched with a dimension that - /// matches. Other values are illegal. If the kernel being finalized, or any of - /// the functions it calls, has a requireddimsize control directive, then the - /// values must be the same. This can be used to optimize the code generated to - /// compute the absolute and flat work-group and work-item id, and the dim - /// HSAIL operations. - uint8_t required_dim; - - /// Reserved. Must be 0. - uint8_t reserved[75]; -} hsa_ext_control_directives_t; - -/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel -/// Code Object to set up the hardware to execute the kernel dispatch. -/// -/// Initial Kernel Register State. -/// -/// Initial kernel register state will be set up by CP/SPI prior to the start -/// of execution of every wavefront. This is limited by the constraints of the -/// current hardware. -/// -/// The order of the SGPR registers is defined, but the Finalizer can specify -/// which ones are actually setup in the amd_kernel_code_t object using the -/// enable_sgpr_* bit fields. The register numbers used for enabled registers -/// are dense starting at SGPR0: the first enabled register is SGPR0, the next -/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR -/// number. -/// -/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and -/// apply to all waves of the grid. It is possible to specify more than 16 User -/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 -/// are actually initialized. These are then immediately followed by the System -/// SGPRs that are set up by ADC/SPI and can have different values for each wave -/// of the grid dispatch. -/// -/// SGPR register initial state is defined as follows: -/// -/// Private Segment Buffer (enable_sgpr_private_segment_buffer): -/// Number of User SGPR registers: 4. V# that can be used, together with -/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg -/// segments using a segment address. It must be set as follows: -/// - Base address: of the scratch memory area used by the dispatch. It -/// does not include the scratch wave offset. It will be the per process -/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for -/// example there may be a per pipe offset, or per AQL Queue offset). -/// - Stride + data_format: Element Size * Index Stride (???) -/// - Cache swizzle: ??? -/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for -/// scratch) -/// - Num records: Flat Scratch Work Item Size / Element Size (???) -/// - Dst_sel_*: ??? -/// - Num_format: ??? 
-/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must -/// agree with amd_kernel_code_t.privateElementSize) -/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must -/// be number of wavefront lanes for scratch, must agree with -/// amd_kernel_code_t.wavefrontSize) -/// - Add tid enable: 1 -/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, -/// - Hash_enable: ??? -/// - Heap: ??? -/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE -/// - Type: 0 (a buffer) (???) -/// -/// Dispatch Ptr (enable_sgpr_dispatch_ptr): -/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet -/// for kernel actually executing. -/// -/// Queue Ptr (enable_sgpr_queue_ptr): -/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for -/// AQL queue on which the dispatch packet was queued. -/// -/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): -/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This -/// is directly copied from the kernargPtr in the dispatch packet. Having CP -/// load it once avoids loading it at the beginning of every wavefront. -/// -/// Dispatch Id (enable_sgpr_dispatch_id): -/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch -/// packet being executed. -/// -/// Flat Scratch Init (enable_sgpr_flat_scratch_init): -/// Number of User SGPR registers: 2. This is 2 SGPRs. -/// -/// For CI/VI: -/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE -/// to base of memory for scratch for this dispatch. This is the same offset -/// used in computing the Scratch Segment Buffer base address. The value of -/// Scratch Wave Offset must be added by the kernel code and moved to -/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. -/// -/// The second SGPR is 32 bit byte size of a single work-itemÂ’s scratch -/// memory usage. This is directly loaded from the dispatch packet Private -/// Segment Byte Size and rounded up to a multiple of DWORD. -/// -/// \todo [Does CP need to round this to >4 byte alignment?] -/// -/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in -/// flat memory instructions. Having CP load it once avoids loading it at -/// the beginning of every wavefront. -/// -/// For PI: -/// This is the 64 bit base address of the scratch backing memory for -/// allocated by CP for this dispatch. -/// -/// Private Segment Size (enable_sgpr_private_segment_size): -/// Number of User SGPR registers: 1. The 32 bit byte size of a single -/// work-itemÂ’s scratch memory allocation. This is the value from the dispatch -/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. -/// -/// \todo [Does CP need to round this to >4 byte alignment?] -/// -/// Having CP load it once avoids loading it at the beginning of every -/// wavefront. -/// -/// \todo [This will not be used for CI/VI since it is the same value as -/// the second SGPR of Flat Scratch Init. However, it is need for PI which -/// changes meaning of Flat Scratchg Init..] -/// -/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): -/// Number of User SGPR registers: 1. 32 bit count of the number of -/// work-groups in the X dimension for the grid being executed. Computed from -/// the fields in the HsaDispatchPacket as -/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). -/// -/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): -/// Number of User SGPR registers: 1. 
32 bit count of the number of -/// work-groups in the Y dimension for the grid being executed. Computed from -/// the fields in the HsaDispatchPacket as -/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). -/// -/// Only initialized if <16 previous SGPRs initialized. -/// -/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): -/// Number of User SGPR registers: 1. 32 bit count of the number of -/// work-groups in the Z dimension for the grid being executed. Computed -/// from the fields in the HsaDispatchPacket as -/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). -/// -/// Only initialized if <16 previous SGPRs initialized. -/// -/// Work-Group Id X (enable_sgpr_workgroup_id_x): -/// Number of System SGPR registers: 1. 32 bit work group id in X dimension -/// of grid for wavefront. Always present. -/// -/// Work-Group Id Y (enable_sgpr_workgroup_id_y): -/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension -/// of grid for wavefront. -/// -/// Work-Group Id Z (enable_sgpr_workgroup_id_z): -/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension -/// of grid for wavefront. If present then Work-group Id Y will also be -/// present -/// -/// Work-Group Info (enable_sgpr_workgroup_info): -/// Number of System SGPR registers: 1. {first_wave, 14Â’b0000, -/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} -/// -/// Private Segment Wave Byte Offset -/// (enable_sgpr_private_segment_wave_byte_offset): -/// Number of System SGPR registers: 1. 32 bit byte offset from base of -/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg -/// segment address when using Scratch Segment Buffer. It must be added to -/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. -/// -/// -/// The order of the VGPR registers is defined, but the Finalizer can specify -/// which ones are actually setup in the amd_kernel_code_t object using the -/// enableVgpr* bit fields. The register numbers used for enabled registers -/// are dense starting at VGPR0: the first enabled register is VGPR0, the next -/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR -/// number. -/// -/// VGPR register initial state is defined as follows: -/// -/// Work-Item Id X (always initialized): -/// Number of registers: 1. 32 bit work item id in X dimension of work-group -/// for wavefront lane. -/// -/// Work-Item Id X (enable_vgpr_workitem_id > 0): -/// Number of registers: 1. 32 bit work item id in Y dimension of work-group -/// for wavefront lane. -/// -/// Work-Item Id X (enable_vgpr_workitem_id > 0): -/// Number of registers: 1. 32 bit work item id in Z dimension of work-group -/// for wavefront lane. -/// -/// -/// The setting of registers is being done by existing GPU hardware as follows: -/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data -/// registers. -/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any -/// combination including none. -/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot -/// be added into the value Flat Scratch Offset which would avoid the -/// Finalizer generated prolog having to do the add. -/// 4) The VGPRs are set by SPI which only supports specifying either (X), -/// (X, Y) or (X, Y, Z). -/// -/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so -/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and -/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? 
Register. -/// -/// The global segment can be accessed either using flat operations or buffer -/// operations. If buffer operations are used then the Global Buffer used to -/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a -/// segment address is not passed into the kernel code by CP since its base -/// address is always 0. Instead the Finalizer generates prolog code to -/// initialize 4 SGPRs with a V# that has the following properties, and then -/// uses that in the buffer instructions: -/// - base address of 0 -/// - no swizzle -/// - ATC=1 -/// - MTYPE set to support memory coherence specified in -/// amd_kernel_code_t.globalMemoryCoherence -/// -/// When the Global Buffer is used to access the Kernarg segment, must add the -/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. -/// Alternatively scalar loads can be used if the kernarg offset is uniform, as -/// the kernarg segment is constant for the duration of the kernel execution. -/// -typedef struct amd_kernel_code_s { - /// The AMD major version of the Code Object. Must be the value - /// AMD_CODE_VERSION_MAJOR. - amd_code_version32_t amd_code_version_major; - - /// The AMD minor version of the Code Object. Minor versions must be - /// backward compatible. Must be the value - /// AMD_CODE_VERSION_MINOR. - amd_code_version32_t amd_code_version_minor; - - /// The byte size of this struct. Must be set to - /// sizeof(amd_kernel_code_t). Used for backward - /// compatibility. - uint32_t struct_byte_size; - - /// The target chip instruction set for which code has been - /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration - /// in sc/Interface/SCCommon.h. - uint32_t target_chip; - - /// Byte offset (possibly negative) from start of amd_kernel_code_t - /// object to kernel's entry point instruction. The actual code for - /// the kernel is required to be 256 byte aligned to match hardware - /// requirements (SQ cache line is 16). The code must be position - /// independent code (PIC) for AMD devices to give runtime the - /// option of copying code to discrete GPU memory or APU L2 - /// cache. The Finalizer should endeavour to allocate all kernel - /// machine code in contiguous memory pages so that a device - /// pre-fetcher will tend to only pre-fetch Kernel Code objects, - /// improving cache performance. - int64_t kernel_code_entry_byte_offset; - - /// Range of bytes to consider prefetching expressed as an offset - /// and size. The offset is from the start (possibly negative) of - /// amd_kernel_code_t object. Set both to 0 if no prefetch - /// information is available. - /// - /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did - /// not make the size a uint64_t as prefetching more than 4GiB seems - /// excessive. - int64_t kernel_code_prefetch_byte_offset; - uint64_t kernel_code_prefetch_byte_size; - - /// Number of bytes of scratch backing memory required for full - /// occupancy of target chip. This takes into account the number of - /// bytes of scratch per work-item, the wavefront size, the maximum - /// number of wavefronts per CU, and the number of CUs. This is an - /// upper limit on scratch. If the grid being dispatched is small it - /// may only need less than this. If the kernel uses no scratch, or - /// the Finalizer has not computed this value, it must be 0. - uint64_t max_scratch_backing_memory_byte_size; - - /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and - /// COMPUTE_PGM_RSRC2 registers. 
- amd_compute_pgm_resource_register64_t compute_pgm_resource_registers; - - /// Code properties. See amd_code_property_mask_t for a full list of - /// properties. - amd_code_property32_t code_properties; - - /// The amount of memory required for the combined private, spill - /// and arg segments for a work-item in bytes. If - /// is_dynamic_callstack is 1 then additional space must be added to - /// this value for the call stack. - uint32_t workitem_private_segment_byte_size; - - /// The amount of group segment memory required by a work-group in - /// bytes. This does not include any dynamically allocated group - /// segment memory that may be added when the kernel is - /// dispatched. - uint32_t workgroup_group_segment_byte_size; - - /// Number of byte of GDS required by kernel dispatch. Must be 0 if - /// not using GDS. - uint32_t gds_segment_byte_size; - - /// The size in bytes of the kernarg segment that holds the values - /// of the arguments to the kernel. This could be used by CP to - /// prefetch the kernarg segment pointed to by the dispatch packet. - uint64_t kernarg_segment_byte_size; - - /// Number of fbarrier's used in the kernel and all functions it - /// calls. If the implementation uses group memory to allocate the - /// fbarriers then that amount must already be included in the - /// workgroup_group_segment_byte_size total. - uint32_t workgroup_fbarrier_count; - - /// Number of scalar registers used by a wavefront. This includes - /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size - /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a - /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. - uint16_t wavefront_sgpr_count; - - /// Number of vector registers used by each work-item. Used to set - /// COMPUTE_PGM_RSRC1.VGPRS. - uint16_t workitem_vgpr_count; - - /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the - /// first fixed VGPR number reserved. - uint16_t reserved_vgpr_first; - - /// The number of consecutive VGPRs reserved by the client. If - /// is_debug_supported then this count includes VGPRs reserved - /// for debugger use. - uint16_t reserved_vgpr_count; - - /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the - /// first fixed SGPR number reserved. - uint16_t reserved_sgpr_first; - - /// The number of consecutive SGPRs reserved by the client. If - /// is_debug_supported then this count includes SGPRs reserved - /// for debugger use. - uint16_t reserved_sgpr_count; - - /// If is_debug_supported is 0 then must be 0. Otherwise, this is the - /// fixed SGPR number used to hold the wave scratch offset for the - /// entire kernel execution, or uint16_t(-1) if the register is not - /// used or not known. - uint16_t debug_wavefront_private_segment_offset_sgpr; - - /// If is_debug_supported is 0 then must be 0. Otherwise, this is the - /// fixed SGPR number of the first of 4 SGPRs used to hold the - /// scratch V# used for the entire kernel execution, or uint16_t(-1) - /// if the registers are not used or not known. - uint16_t debug_private_segment_buffer_sgpr; - - /// The maximum byte alignment of variables used by the kernel in - /// the specified memory segment. Expressed as a power of two. Must - /// be at least HSA_POWERTWO_16. - hsa_powertwo8_t kernarg_segment_alignment; - hsa_powertwo8_t group_segment_alignment; - hsa_powertwo8_t private_segment_alignment; - - uint8_t reserved3; - - /// Type of code object. 
- hsa_ext_code_kind32_t code_type; - - /// Reserved for code properties if any are defined in the future. - /// There are currently no code properties so this field must be 0. - uint32_t reserved4; - - /// Wavefront size expressed as a power of two. Must be a power of 2 - /// in range 1..64 inclusive. Used to support runtime query that - /// obtains wavefront size, which may be used by application to - /// allocated dynamic group memory and set the dispatch work-group - /// size. - hsa_powertwo8_t wavefront_size; - - /// The optimization level specified when the kernel was - /// finalized. - uint8_t optimization_level; - - /// The HSAIL profile defines which features are used. This - /// information is from the HSAIL version directive. If this - /// amd_kernel_code_t is not generated from an HSAIL compilation - /// unit then must be 0. - hsa_ext_brig_profile8_t hsail_profile; - - /// The HSAIL machine model gives the address sizes used by the - /// code. This information is from the HSAIL version directive. If - /// not generated from an HSAIL compilation unit then must still - /// indicate for what machine mode the code is generated. - hsa_ext_brig_machine_model8_t hsail_machine_model; - - /// The HSAIL major version. This information is from the HSAIL - /// version directive. If this amd_kernel_code_t is not - /// generated from an HSAIL compilation unit then must be 0. - uint32_t hsail_version_major; - - /// The HSAIL minor version. This information is from the HSAIL - /// version directive. If this amd_kernel_code_t is not - /// generated from an HSAIL compilation unit then must be 0. - uint32_t hsail_version_minor; - - /// Reserved for HSAIL target options if any are defined in the - /// future. There are currently no target options so this field - /// must be 0. - uint16_t reserved5; - - /// Reserved. Must be 0. - uint16_t reserved6; - - /// The values should be the actually values used by the finalizer - /// in generating the code. This may be the union of values - /// specified as finalizer arguments and explicit HSAIL control - /// directives. If the finalizer chooses to ignore a control - /// directive, and not generate constrained code, then the control - /// directive should not be marked as enabled even though it was - /// present in the HSAIL or finalizer argument. The values are - /// intended to reflect the constraints that the code actually - /// requires to correctly execute, not the values that were - /// actually specified at finalize time. - hsa_ext_control_directives_t control_directive; - - /// The code can immediately follow the amd_kernel_code_t, or can - /// come after subsequent amd_kernel_code_t structs when there are - /// multiple kernels in the compilation unit. - -} amd_kernel_code_t; - -#endif // AMDKERNELCODET_H diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp deleted file mode 100644 index 02a63604970..00000000000 --- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp +++ /dev/null @@ -1,1380 +0,0 @@ -//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -namespace { - -struct OptionalOperand; - -class AMDGPUOperand : public MCParsedAsmOperand { - enum KindTy { - Token, - Immediate, - Register, - Expression - } Kind; - - SMLoc StartLoc, EndLoc; - -public: - AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} - - MCContext *Ctx; - - enum ImmTy { - ImmTyNone, - ImmTyDSOffset0, - ImmTyDSOffset1, - ImmTyGDS, - ImmTyOffset, - ImmTyGLC, - ImmTySLC, - ImmTyTFE, - ImmTyClamp, - ImmTyOMod - }; - - struct TokOp { - const char *Data; - unsigned Length; - }; - - struct ImmOp { - bool IsFPImm; - ImmTy Type; - int64_t Val; - }; - - struct RegOp { - unsigned RegNo; - int Modifiers; - const MCRegisterInfo *TRI; - bool IsForcedVOP3; - }; - - union { - TokOp Tok; - ImmOp Imm; - RegOp Reg; - const MCExpr *Expr; - }; - - void addImmOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm(getImm())); - } - - StringRef getToken() const { - return StringRef(Tok.Data, Tok.Length); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); - } - - void addRegOrImmOperands(MCInst &Inst, unsigned N) const { - if (isReg()) - addRegOperands(Inst, N); - else - addImmOperands(Inst, N); - } - - void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm( - Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); - addRegOperands(Inst, N); - } - - void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { - if (isImm()) - addImmOperands(Inst, N); - else { - assert(isExpr()); - Inst.addOperand(MCOperand::createExpr(Expr)); - } - } - - bool defaultTokenHasSuffix() const { - StringRef Token(Tok.Data, Tok.Length); - - return Token.endswith("_e32") || Token.endswith("_e64"); - } - - bool isToken() const override { - return Kind == Token; - } - - bool isImm() const override { - return Kind == Immediate; - } - - bool isInlineImm() const { - float F = BitsToFloat(Imm.Val); - // TODO: Add 0.5pi for VI - return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || - (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || - F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); - } - - bool isDSOffset0() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset0; - } - - bool isDSOffset1() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset1; - } - - int64_t getImm() const { - return Imm.Val; - } - - enum ImmTy getImmTy() const { - assert(isImm()); - return Imm.Type; - } - - bool isRegKind() const { - return Kind == Register; - } - - bool isReg() const override { - return Kind == Register && Reg.Modifiers == -1; - } - - bool isRegWithInputMods() const { - return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); - } - - void setModifiers(unsigned Mods) { - assert(isReg()); - Reg.Modifiers = Mods; - } - - bool hasModifiers() const { - assert(isRegKind()); - return Reg.Modifiers != -1; - } - - unsigned getReg() const override { - return Reg.RegNo; - } - - bool isRegOrImm() const { - return isReg() || isImm(); - } - - bool isRegClass(unsigned RCID) const { - return Reg.TRI->getRegClass(RCID).contains(getReg()); - } - - bool isSCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); - } - - bool isSSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); - } - - bool isSSrc64() const { - return isImm() || isInlineImm() || - (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); - } - - bool isVCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); - } - - bool isVCSrc64() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); - } - - bool isVSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); - } - - bool isVSrc64() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); - } - - bool isMem() const override { - return false; - } - - bool isExpr() const { - return Kind == Expression; - } - - bool isSoppBrTarget() const { - return isExpr() || isImm(); - } - - SMLoc getStartLoc() const override { - return StartLoc; - } - - SMLoc getEndLoc() const override { - return EndLoc; - } - - void print(raw_ostream &OS) const override { } - - static std::unique_ptr CreateImm(int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, - bool IsFPImm = false) { - auto Op = llvm::make_unique(Immediate); - Op->Imm.Val = Val; - Op->Imm.IsFPImm = IsFPImm; - Op->Imm.Type = Type; - Op->StartLoc = Loc; - Op->EndLoc = Loc; - return Op; - } - - static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc, - bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique(Token); - Res->Tok.Data = Str.data(); - Res->Tok.Length = Str.size(); - Res->StartLoc = Loc; - Res->EndLoc = Loc; - return Res; - } - - static std::unique_ptr CreateReg(unsigned RegNo, SMLoc S, - SMLoc E, - const MCRegisterInfo *TRI, - 
bool ForceVOP3) { - auto Op = llvm::make_unique(Register); - Op->Reg.RegNo = RegNo; - Op->Reg.TRI = TRI; - Op->Reg.Modifiers = -1; - Op->Reg.IsForcedVOP3 = ForceVOP3; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static std::unique_ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique(Expression); - Op->Expr = Expr; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - bool isDSOffset() const; - bool isDSOffset01() const; - bool isSWaitCnt() const; - bool isMubufOffset() const; -}; - -class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; - const MCInstrInfo &MII; - MCAsmParser &Parser; - - unsigned ForcedEncodingSize; - /// @name Auto-generated Match Functions - /// { - -#define GET_ASSEMBLER_HEADER -#include "AMDGPUGenAsmMatcher.inc" - - /// } - -public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ - - if (STI.getFeatureBits().none()) { - // Set default features. - STI.ToggleFeature("SOUTHERN_ISLANDS"); - } - - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - - unsigned getForcedEncodingSize() const { - return ForcedEncodingSize; - } - - void setForcedEncodingSize(unsigned Size) { - ForcedEncodingSize = Size; - } - - bool isForcedVOP3() const { - return ForcedEncodingSize == 64; - } - - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - unsigned checkTargetMatchPredicate(MCInst &Inst) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) override; - bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default = 0); - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, - OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseOptionalOps( - const ArrayRef &OptionalOps, - OperandVector &Operands); - - - void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); - void cvtDS(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); - - bool parseCnt(int64_t &IntVal); - OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); - OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); - - OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); - void cvtFlat(MCInst &Inst, const OperandVector &Operands); - - void cvtMubuf(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseOffset(OperandVector &Operands); - OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseGLC(OperandVector &Operands); - OperandMatchResultTy parseSLC(OperandVector 
&Operands); - OperandMatchResultTy parseTFE(OperandVector &Operands); - - OperandMatchResultTy parseDMask(OperandVector &Operands); - OperandMatchResultTy parseUNorm(OperandVector &Operands); - OperandMatchResultTy parseR128(OperandVector &Operands); - - void cvtVOP3(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); -}; - -struct OptionalOperand { - const char *Name; - AMDGPUOperand::ImmTy Type; - bool IsBit; - int64_t Default; - bool (*ConvertResult)(int64_t&); -}; - -} - -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { - switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); - case 1: return AMDGPU::VGPR_32RegClassID; - case 2: return AMDGPU::VReg_64RegClassID; - case 3: return AMDGPU::VReg_96RegClassID; - case 4: return AMDGPU::VReg_128RegClassID; - case 8: return AMDGPU::VReg_256RegClassID; - case 16: return AMDGPU::VReg_512RegClassID; - } - } - - switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } -} - -static unsigned getRegForName(const StringRef &RegName) { - - return StringSwitch(RegName) - .Case("exec", AMDGPU::EXEC) - .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) - .Case("m0", AMDGPU::M0) - .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) - .Case("vcc_lo", AMDGPU::VCC_LO) - .Case("vcc_hi", AMDGPU::VCC_HI) - .Case("exec_lo", AMDGPU::EXEC_LO) - .Case("exec_hi", AMDGPU::EXEC_HI) - .Default(0); -} - -bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - const AsmToken Tok = Parser.getTok(); - StartLoc = Tok.getLoc(); - EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); - RegNo = getRegForName(RegName); - - if (RegNo) { - Parser.Lex(); - return false; - } - - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') - return true; - - bool IsVgpr = RegName[0] == 'v'; - unsigned RegWidth; - unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register - RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) - return true; - Parser.Lex(); - } else { - // We have a register greater than 32-bits. - - int64_t RegLo, RegHi; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) - return true; - - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegLo)) - return true; - - if (getLexer().isNot(AsmToken::Colon)) - return true; - - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegHi)) - return true; - - if (getLexer().isNot(AsmToken::RBrac)) - return true; - - Parser.Lex(); - RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { - // VGPR registers aren't aligned. - RegIndexInClass = RegLo; - } else { - // SGPR registers are aligned. Max alignment is 4 dwords. 
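// A worked example of this register-range handling (illustrative; it assumes
// the usual tuple order of the generated register classes): "s[2:3]" parses
// as RegLo = 2, RegHi = 3, so RegWidth = 2 and
// RegIndexInClass = 2 / std::min(2u, 4u) = 1, i.e. the second register in
// SGPR_64. "s[4:7]" gives RegWidth = 4 and RegIndexInClass = 4 / 4 = 1 in
// SReg_128. VGPR ranges are not divided, so "v[2:3]" keeps
// RegIndexInClass = 2 in VReg_64.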
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); - } - } - - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) - return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; -} - -unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { - - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - - if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || - (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) - return Match_InvalidOperand; - - return Match_Success; -} - - -bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) { - MCInst Inst; - - switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { - default: break; - case Match_Success: - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - case Match_MissingFeature: - return Error(IDLoc, "instruction not supported on this GPU"); - - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); - - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0ULL) { - if (ErrorInfo >= Operands.size()) { - if (isForcedVOP3()) { - // If 64-bit encoding has been forced we can end up with no - // clamp or omod operands if none of the registers have modifiers, - // so we need to add these to the operand list. - AMDGPUOperand &LastOp = - ((AMDGPUOperand &)*Operands[Operands.size() - 1]); - if (LastOp.isRegKind() || - (LastOp.isImm() && - LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { - SMLoc S = Parser.getTok().getLoc(); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyClamp)); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOMod)); - bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, - Out, ErrorInfo, - MatchingInlineAsm); - if (!Res) - return Res; - } - - } - return Error(IDLoc, "too few operands for instruction"); - } - - ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - } - return Error(ErrorLoc, "invalid operand for instruction"); - } - } - llvm_unreachable("Implement any new match types added!"); -} - -bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { - return true; -} - -static bool operandsHaveModifiers(const OperandVector &Operands) { - - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isRegKind() && Op.hasModifiers()) - return true; - if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || - Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) - return true; - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { - - // Try to parse with a custom parser - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); - - // If we successfully parsed the operand or if there as an error parsing, - // we are done. - // - // If we are parsing after we reach EndOfStatement then this means we - // are appending default values to the Operands list. This is only done - // by custom parser, so we shouldn't continue on to the generic parsing. 
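// A short summary of what the hand-written parsing below accepts (examples
// are illustrative): a 32-bit integer literal, a floating-point literal
// (stored as its IEEE bit pattern), a register name optionally wrapped in
// the '-' and '|...|' source modifiers, or a bare token. Bit 0 of the
// modifier flags is negation and bit 1 is absolute value, so "-v2" yields
// Reg.Modifiers == 1, "|v2|" yields 2, and "-|v2|" yields 3. A leading '-'
// also negates integer and floating-point literals.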
- if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || - getLexer().is(AsmToken::EndOfStatement)) - return ResTy; - - bool Negate = false, Abs = false; - if (getLexer().getKind()== AsmToken::Minus) { - Parser.Lex(); - Negate = true; - } - - if (getLexer().getKind() == AsmToken::Pipe) { - Parser.Lex(); - Abs = true; - } - - switch(getLexer().getKind()) { - case AsmToken::Integer: { - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; - APInt IntVal32(32, IntVal); - if (IntVal32.getSExtValue() != IntVal) { - Error(S, "invalid immediate: only 32-bit values are legal"); - return MatchOperand_ParseFail; - } - - IntVal = IntVal32.getSExtValue(); - if (Negate) - IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); - return MatchOperand_Success; - } - case AsmToken::Real: { - // FIXME: We should emit an error if a double precisions floating-point - // value is used. I'm not sure the best way to detect this. - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; - - APFloat F((float)BitsToDouble(IntVal)); - if (Negate) - F.changeSign(); - Operands.push_back( - AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); - return MatchOperand_Success; - } - case AsmToken::Identifier: { - SMLoc S, E; - unsigned RegNo; - if (!ParseRegister(RegNo, S, E)) { - - bool HasModifiers = operandsHaveModifiers(Operands); - unsigned Modifiers = 0; - - if (Negate) - Modifiers |= 0x1; - - if (Abs) { - if (getLexer().getKind() != AsmToken::Pipe) - return MatchOperand_ParseFail; - Parser.Lex(); - Modifiers |= 0x2; - } - - if (Modifiers && !HasModifiers) { - // We are adding a modifier to src1 or src2 and previous sources - // don't have modifiers, so we need to go back and empty modifers - // for each previous source. - for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; - --PrevRegIdx) { - - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); - RegOp.setModifiers(0); - } - } - - - Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), - isForcedVOP3())); - - if (HasModifiers || Modifiers) { - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); - RegOp.setModifiers(Modifiers); - - } - } else { - Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), - S)); - Parser.Lex(); - } - return MatchOperand_Success; - } - default: - return MatchOperand_NoMatch; - } -} - -bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, - SMLoc NameLoc, OperandVector &Operands) { - - // Clear any forced encodings from the previous instruction. - setForcedEncodingSize(0); - - if (Name.endswith("_e64")) - setForcedEncodingSize(64); - else if (Name.endswith("_e32")) - setForcedEncodingSize(32); - - // Add the instruction mnemonic - Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); - - while (!getLexer().is(AsmToken::EndOfStatement)) { - AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); - - // Eat the comma or space if there is one. 
- if (getLexer().is(AsmToken::Comma)) - Parser.Lex(); - - switch (Res) { - case MatchOperand_Success: break; - case MatchOperand_ParseFail: return Error(getLexer().getLoc(), - "failed parsing operand."); - case MatchOperand_NoMatch: return Error(getLexer().getLoc(), - "not a valid operand."); - } - } - - // Once we reach end of statement, continue parsing so we can add default - // values for optional arguments. - AMDGPUAsmParser::OperandMatchResultTy Res; - while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { - if (Res != MatchOperand_Success) - return Error(getLexer().getLoc(), "failed parsing operand."); - } - return false; -} - -//===----------------------------------------------------------------------===// -// Utility functions -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default) { - - // We are at the end of the statement, and this is a default argument, so - // use a default value. - if (getLexer().is(AsmToken::EndOfStatement)) { - Int = Default; - return MatchOperand_Success; - } - - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Identifier: { - StringRef OffsetName = Parser.getTok().getString(); - if (!OffsetName.equals(Prefix)) - return MatchOperand_NoMatch; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) - return MatchOperand_ParseFail; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - - if (getParser().parseAbsoluteExpression(Int)) - return MatchOperand_ParseFail; - break; - } - } - return MatchOperand_Success; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { - - SMLoc S = Parser.getTok().getLoc(); - int64_t Offset = 0; - - AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); - if (Res != MatchOperand_Success) - return Res; - - Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); - return MatchOperand_Success; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { - int64_t Bit = 0; - SMLoc S = Parser.getTok().getLoc(); - - // We are at the end of the statement, and this is a default argument, so - // use a default value. 
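// Taken together, parseIntWithPrefix above and parseNamedBit here cover the
// two optional-operand spellings used later in this file (examples are
// illustrative): "offset:16" is an identifier matching the prefix, a colon
// and an integer, and yields 16, while for a named bit such as "gds" the
// bare name yields 1 and the "no"-prefixed form ("nogds") yields 0. When
// the statement has already ended, the default value is used instead.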
- if (getLexer().isNot(AsmToken::EndOfStatement)) { - switch(getLexer().getKind()) { - case AsmToken::Identifier: { - StringRef Tok = Parser.getTok().getString(); - if (Tok == Name) { - Bit = 1; - Parser.Lex(); - } else if (Tok.startswith("no") && Tok.endswith(Name)) { - Bit = 0; - Parser.Lex(); - } else { - return MatchOperand_NoMatch; - } - break; - } - default: - return MatchOperand_NoMatch; - } - } - - Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); - return MatchOperand_Success; -} - -static bool operandsHasOptionalOp(const OperandVector &Operands, - const OptionalOperand &OOp) { - for (unsigned i = 0; i < Operands.size(); i++) { - const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); - if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || - (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) - return true; - - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOptionalOps(const ArrayRef &OptionalOps, - OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - for (const OptionalOperand &Op : OptionalOps) { - if (operandsHasOptionalOp(Operands, Op)) - continue; - AMDGPUAsmParser::OperandMatchResultTy Res; - int64_t Value; - if (Op.IsBit) { - Res = parseNamedBit(Op.Name, Operands, Op.Type); - if (Res == MatchOperand_NoMatch) - continue; - return Res; - } - - Res = parseIntWithPrefix(Op.Name, Value, Op.Default); - - if (Res == MatchOperand_NoMatch) - continue; - - if (Res != MatchOperand_Success) - return Res; - - if (Op.ConvertResult && !Op.ConvertResult(Value)) { - return MatchOperand_ParseFail; - } - - Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); - return MatchOperand_Success; - } - return MatchOperand_NoMatch; -} - -//===----------------------------------------------------------------------===// -// ds -//===----------------------------------------------------------------------===// - -static const OptionalOperand DSOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -static const OptionalOperand DSOptionalOpsOff01 [] = { - {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, - {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOps, Operands); -} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOpsOff01, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - AMDGPUAsmParser::OperandMatchResultTy Res = - parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - if (Res == MatchOperand_NoMatch) { - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOffset)); - Res = MatchOperand_Success; - } - return Res; -} - -bool AMDGPUOperand::isDSOffset() const { - return isImm() && isUInt<16>(getImm()); -} - -bool AMDGPUOperand::isDSOffset01() const { - return isImm() && isUInt<8>(getImm()); -} - -void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, - const OperandVector &Operands) { - - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if 
(Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; - unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - - ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 - ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} - -void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { - - std::map OptionalIdx; - bool GDSOnly = false; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - if (Op.isToken() && Op.getToken() == "gds") { - GDSOnly = true; - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset - - if (!GDSOnly) { - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds - } - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} - - -//===----------------------------------------------------------------------===// -// s_waitcnt -//===----------------------------------------------------------------------===// - -bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { - StringRef CntName = Parser.getTok().getString(); - int64_t CntVal; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::LParen)) - return true; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return true; - - if (getParser().parseAbsoluteExpression(CntVal)) - return true; - - if (getLexer().isNot(AsmToken::RParen)) - return true; - - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); - - int CntShift; - int CntMask; - - if (CntName == "vmcnt") { - CntMask = 0xf; - CntShift = 0; - } else if (CntName == "expcnt") { - CntMask = 0x7; - CntShift = 4; - } else if (CntName == "lgkmcnt") { - CntMask = 0x7; - CntShift = 8; - } else { - return true; - } - - IntVal &= ~(CntMask << CntShift); - IntVal |= (CntVal << CntShift); - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - // Disable all counters by default. - // vmcnt [3:0] - // expcnt [6:4] - // lgkmcnt [10:8] - int64_t CntVal = 0x77f; - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: - // The operand can be an integer value. 
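// A minimal sketch of the packing parseCnt above performs, using the field
// layout listed in the comments (encodeWaitcnt is a hypothetical helper,
// not part of this parser):
//
//   constexpr int64_t encodeWaitcnt(int64_t Vm, int64_t Exp, int64_t Lgkm) {
//     return (Vm & 0xf) | ((Exp & 0x7) << 4) | ((Lgkm & 0x7) << 8);
//   }
//   static_assert(encodeWaitcnt(0xf, 0x7, 0x7) == 0x77f, "the default above");
//   static_assert(encodeWaitcnt(1, 0x7, 0x7) == 0x771, "vmcnt(1), others untouched");
//   static_assert(encodeWaitcnt(0, 0, 0) == 0x000, "wait for everything");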
- if (getParser().parseAbsoluteExpression(CntVal)) - return MatchOperand_ParseFail; - break; - - case AsmToken::Identifier: - do { - if (parseCnt(CntVal)) - return MatchOperand_ParseFail; - } while(getLexer().isNot(AsmToken::EndOfStatement)); - break; - } - Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); - return MatchOperand_Success; -} - -bool AMDGPUOperand::isSWaitCnt() const { - return isImm(); -} - -//===----------------------------------------------------------------------===// -// sopp branch targets -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) - return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); - return MatchOperand_Success; - } - - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr( - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; - } -} - -//===----------------------------------------------------------------------===// -// flat -//===----------------------------------------------------------------------===// - -static const OptionalOperand FlatOptionalOps [] = { - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -static const OptionalOperand FlatAtomicOptionalOps [] = { - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatOptionalOps, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatAtomicOptionalOps, Operands); -} - -void AMDGPUAsmParser::cvtFlat(MCInst &Inst, - const OperandVector &Operands) { - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle 'glc' token which is sometimes hard-coded into the - // asm string. There are no MCInst operands for these. - if (Op.isToken()) - continue; - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - - } - - // flat atomic instructions don't have a glc argument. 
- if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - } - - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; - - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); -} - -//===----------------------------------------------------------------------===// -// mubuf -//===----------------------------------------------------------------------===// - -static const OptionalOperand MubufOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { - return parseOptionalOps(MubufOptionalOps, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOffset(OperandVector &Operands) { - return parseIntWithPrefix("offset", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseGLC(OperandVector &Operands) { - return parseNamedBit("glc", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSLC(OperandVector &Operands) { - return parseNamedBit("slc", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseTFE(OperandVector &Operands) { - return parseNamedBit("tfe", Operands); -} - -bool AMDGPUOperand::isMubufOffset() const { - return isImm() && isUInt<12>(getImm()); -} - -void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, - const OperandVector &Operands) { - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle the case where soffset is an immediate - if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { - Op.addImmOperands(Inst, 1); - continue; - } - - // Handle tokens like 'offen' which are sometimes hard-coded into the - // asm string. There are no MCInst operands for these. 
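// For reference, the offset spellings accepted in this file are range
// checked by the is*Offset predicates: isMubufOffset requires a value that
// fits in 12 unsigned bits (0..4095), isDSOffset allows 16 bits, and
// isDSOffset01 allows 8 bits for each of the offset0/offset1 pair.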
- if (Op.isToken()) { - continue; - } - assert(Op.isImm()); - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - assert(OptionalIdx.size() == 4); - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; - - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); -} - -//===----------------------------------------------------------------------===// -// mimg -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDMask(OperandVector &Operands) { - return parseIntWithPrefix("dmask", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { - return parseNamedBit("unorm", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseR128(OperandVector &Operands) { - return parseNamedBit("r128", Operands); -} - -//===----------------------------------------------------------------------===// -// vop3 -//===----------------------------------------------------------------------===// - -static bool ConvertOmodMul(int64_t &Mul) { - if (Mul != 1 && Mul != 2 && Mul != 4) - return false; - - Mul >>= 1; - return true; -} - -static bool ConvertOmodDiv(int64_t &Div) { - if (Div == 1) { - Div = 0; - return true; - } - - if (Div == 2) { - Div = 3; - return true; - } - - return false; -} - -static const OptionalOperand VOP3OptionalOps [] = { - {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, - {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, - {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, -}; - -static bool isVOP3(OperandVector &Operands) { - if (operandsHaveModifiers(Operands)) - return true; - - AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); - - if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) - return true; - - if (Operands.size() >= 5) - return true; - - if (Operands.size() > 3) { - AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); - if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || - Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) - return true; - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { - - // The value returned by this function may change after parsing - // an operand so store the original value here. 
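// Worked example of the omod conversions above (illustrative): "mul:2" and
// "mul:4" pass through ConvertOmodMul, which shifts the literal right by
// one, giving output-modifier values 1 and 2; "div:2" is mapped to 3 by
// ConvertOmodDiv, and "clamp" is a plain bit. Any other literal makes the
// conversion fail and the operand is rejected.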
- bool HasModifiers = operandsHaveModifiers(Operands); - - bool IsVOP3 = isVOP3(Operands); - if (HasModifiers || IsVOP3 || - getLexer().isNot(AsmToken::EndOfStatement) || - getForcedEncodingSize() == 64) { - - AMDGPUAsmParser::OperandMatchResultTy Res = - parseOptionalOps(VOP3OptionalOps, Operands); - - if (!HasModifiers && Res == MatchOperand_Success) { - // We have added a modifier operation, so we need to make sure all - // previous register operands have modifiers - for (unsigned i = 2, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isReg()) - Op.setModifiers(0); - } - } - return Res; - } - return MatchOperand_NoMatch; -} - -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); - unsigned i = 2; - - std::map OptionalIdx; - - if (operandsHaveModifiers(Operands)) { - for (unsigned e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - if (Op.isRegWithInputMods()) { - ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); - continue; - } - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; - unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; - - ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); - } else { - for (unsigned e = Operands.size(); i != e; ++i) - ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); - } -} - -/// Force static initialization. -extern "C" void LLVMInitializeR600AsmParser() { - RegisterMCAsmParser A(TheAMDGPUTarget); - RegisterMCAsmParser B(TheGCNTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_MATCHER_IMPLEMENTATION -#include "AMDGPUGenAsmMatcher.inc" - diff --git a/lib/Target/R600/AsmParser/CMakeLists.txt b/lib/Target/R600/AsmParser/CMakeLists.txt deleted file mode 100644 index 1b42af73740..00000000000 --- a/lib/Target/R600/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMR600AsmParser - AMDGPUAsmParser.cpp - ) diff --git a/lib/Target/R600/AsmParser/LLVMBuild.txt b/lib/Target/R600/AsmParser/LLVMBuild.txt deleted file mode 100644 index 940e4cee6df..00000000000 --- a/lib/Target/R600/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/R600/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600AsmParser -parent = R600 -required_libraries = MC MCParser R600Desc R600Info Support -add_to_library_groups = R600 diff --git a/lib/Target/R600/AsmParser/Makefile b/lib/Target/R600/AsmParser/Makefile deleted file mode 100644 index e6689b54b6b..00000000000 --- a/lib/Target/R600/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. 
See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600AsmParser - -# Hack: we need to include 'main' R600 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td deleted file mode 100644 index 2f5fdbe9207..00000000000 --- a/lib/Target/R600/CIInstructions.td +++ /dev/null @@ -1,149 +0,0 @@ -//===-- CIInstructions.td - CI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Instruction definitions for CI and newer. -//===----------------------------------------------------------------------===// - - -def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; - -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let SubtargetPredicate = isCIVI in { - -defm V_TRUNC_F64 : VOP1Inst , "v_trunc_f64", - VOP_F64_F64, ftrunc ->; -defm V_CEIL_F64 : VOP1Inst , "v_ceil_f64", - VOP_F64_F64, fceil ->; -defm V_FLOOR_F64 : VOP1Inst , "v_floor_f64", - VOP_F64_F64, ffloor ->; -defm V_RNDNE_F64 : VOP1Inst , "v_rndne_f64", - VOP_F64_F64, frint ->; -defm V_LOG_LEGACY_F32 : VOP1Inst , "v_log_legacy_f32", - VOP_F32_F32 ->; -defm V_EXP_LEGACY_F32 : VOP1Inst , "v_exp_legacy_f32", - VOP_F32_F32 ->; - -//===----------------------------------------------------------------------===// -// Flat Instructions -//===----------------------------------------------------------------------===// - -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 ->; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 ->; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 ->; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; -defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, 
"flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; -defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; - -} // End SubtargetPredicate = isCIVI - -//===----------------------------------------------------------------------===// -// Flat Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [HasFlatAddressSpace] in { - -class FLATLoad_Pattern : - Pat <(vt (flat_ld i64:$ptr)), - (Instr_ADDR64 $ptr, 0, 0, 0) ->; - -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; - -class FLATStore_Pattern : - Pat <(st vt:$value, i64:$ptr), - (Instr $value, $ptr, 0, 0, 0) - >; - -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; - -} // End HasFlatAddressSpace predicate - diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt deleted file mode 100644 index 3c1bc49f282..00000000000 --- a/lib/Target/R600/CMakeLists.txt +++ /dev/null @@ -1,64 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS AMDGPU.td) - 
-tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) -tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) -tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) -tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) -tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) -tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) -tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) -add_public_tablegen_target(AMDGPUCommonTableGen) - -add_llvm_target(R600CodeGen - AMDILCFGStructurizer.cpp - AMDGPUAlwaysInlinePass.cpp - AMDGPUAsmPrinter.cpp - AMDGPUFrameLowering.cpp - AMDGPUIntrinsicInfo.cpp - AMDGPUISelDAGToDAG.cpp - AMDGPUMCInstLower.cpp - AMDGPUMachineFunction.cpp - AMDGPUSubtarget.cpp - AMDGPUTargetMachine.cpp - AMDGPUTargetTransformInfo.cpp - AMDGPUISelLowering.cpp - AMDGPUInstrInfo.cpp - AMDGPUPromoteAlloca.cpp - AMDGPURegisterInfo.cpp - R600ClauseMergePass.cpp - R600ControlFlowFinalizer.cpp - R600EmitClauseMarkers.cpp - R600ExpandSpecialInstrs.cpp - R600InstrInfo.cpp - R600ISelLowering.cpp - R600MachineFunctionInfo.cpp - R600MachineScheduler.cpp - R600OptimizeVectorRegisters.cpp - R600Packetizer.cpp - R600RegisterInfo.cpp - R600TextureIntrinsicsReplacer.cpp - SIAnnotateControlFlow.cpp - SIFixControlFlowLiveIntervals.cpp - SIFixSGPRCopies.cpp - SIFixSGPRLiveRanges.cpp - SIFoldOperands.cpp - SIInsertWaits.cpp - SIInstrInfo.cpp - SIISelLowering.cpp - SILoadStoreOptimizer.cpp - SILowerControlFlow.cpp - SILowerI1Copies.cpp - SIMachineFunctionInfo.cpp - SIPrepareScratchRegs.cpp - SIRegisterInfo.cpp - SIShrinkInstructions.cpp - SITypeRewriter.cpp - ) - -add_subdirectory(AsmParser) -add_subdirectory(InstPrinter) -add_subdirectory(TargetInfo) -add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td deleted file mode 100644 index ba4df82a6d3..00000000000 --- a/lib/Target/R600/CaymanInstructions.td +++ /dev/null @@ -1,226 +0,0 @@ -//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are available only on Cayman -// family GPUs. 
-// -//===----------------------------------------------------------------------===// - -def isCayman : Predicate<"Subtarget->hasCaymanISA()">; - -//===----------------------------------------------------------------------===// -// Cayman Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isCayman] in { - -def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU ->; -def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", - [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU ->; - -def : IMad24Pat; - -let isVector = 1 in { - -def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; - -def MULLO_INT_cm : MULLO_INT_Common<0x8F>; -def MULHI_INT_cm : MULHI_INT_Common<0x90>; -def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; -def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; -def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; -def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; -def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; -def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; -def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; -def SIN_cm : SIN_Common<0x8D>; -def COS_cm : COS_Common<0x8E>; -} // End isVector = 1 - -def : RsqPat; - -def : POW_Common ; - -defm DIV_cm : DIV_Common; -defm : Expand24UBitOps; - -// RECIP_UINT emulation for Cayman -// The multiplication scales from [0,1] to the unsigned integer range -def : Pat < - (AMDGPUurecip i32:$src0), - (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), - (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) ->; - - def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { - let ADDR = 0; - let POP_COUNT = 0; - let COUNT = 0; - } - - -def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; - -class RAT_STORE_DWORD mask> : - CF_MEM_RAT_CACHELESS <0x14, 0, mask, - (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), - "STORE_DWORD $rw_gpr, $index_gpr", - [(global_store vt:$rw_gpr, i32:$index_gpr)]> { - let eop = 0; // This bit is not used on Cayman. -} - -def RAT_STORE_DWORD32 : RAT_STORE_DWORD ; -def RAT_STORE_DWORD64 : RAT_STORE_DWORD ; -def RAT_STORE_DWORD128 : RAT_STORE_DWORD ; - -class VTX_READ_cm buffer_id, dag outs, list pattern> - : VTX_WORD0_cm, VTX_READ { - - // Static fields - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; - let SRC_REL = 0; - // XXX: We can infer this field based on the SRC_GPR. This would allow us - // to store vertex addresses in any channel, not just X. 
- let SRC_SEL_X = 0; - let SRC_SEL_Y = 0; - let STRUCTURED_READ = 0; - let LDS_REQ = 0; - let COALESCED_READ = 0; - - let Inst{31-0} = Word0; -} - -class VTX_READ_8_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 1; // FMT_8 -} - -class VTX_READ_16_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 5; // FMT_16 - -} - -class VTX_READ_32_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 0xD; // COLOR_32 - - // This is not really necessary, but there were some GPU hangs that appeared - // to be caused by ALU instructions in the next instruction group that wrote - // to the $src_gpr registers of the VTX_READ. - // e.g. - // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 - // %T2_X = MOV %ZERO - //Adding this constraint prevents this from happening. - let Constraints = "$src_gpr.ptr = $dst_gpr"; -} - -class VTX_READ_64_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 7; - let DST_SEL_W = 7; - let DATA_FORMAT = 0x1D; // COLOR_32_32 -} - -class VTX_READ_128_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 - - // XXX: Need to force VTX_READ_128 instructions to write to the same register - // that holds its buffer address to avoid potential hangs. We can't use - // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst - // registers are different sizes. 
-} - -//===----------------------------------------------------------------------===// -// VTX Read from parameter memory space -//===----------------------------------------------------------------------===// -def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -//===----------------------------------------------------------------------===// -// VTX Read from global memory space -//===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -} // End isCayman - diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td deleted file mode 100644 index 7adcd46fe19..00000000000 --- a/lib/Target/R600/EvergreenInstructions.td +++ /dev/null @@ -1,670 +0,0 @@ -//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are: -// - Available to Evergreen and newer VLIW4/VLIW5 GPUs -// - Available only on Evergreen family GPUs. 
-// -//===----------------------------------------------------------------------===// - -def isEG : Predicate< - "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!Subtarget->hasCaymanISA()" ->; - -def isEGorCayman : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" ->; - -//===----------------------------------------------------------------------===// -// Evergreen / Cayman store instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEGorCayman] in { - -class CF_MEM_RAT_CACHELESS rat_inst, bits<4> rat_id, bits<4> mask, dag ins, - string name, list pattern> - : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, - "MEM_RAT_CACHELESS "#name, pattern>; - -class CF_MEM_RAT rat_inst, bits<4> rat_id, dag ins, string name, - list pattern> - : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, - "MEM_RAT "#name, pattern>; - -def RAT_MSKOR : CF_MEM_RAT <0x11, 0, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), - "MSKOR $rw_gpr.XW, $index_gpr", - [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] -> { - let eop = 0; -} - -} // End let Predicates = [isEGorCayman] - -//===----------------------------------------------------------------------===// -// Evergreen Only instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEG] in { - -def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; -defm DIV_eg : DIV_Common; - -def MULLO_INT_eg : MULLO_INT_Common<0x8F>; -def MULHI_INT_eg : MULHI_INT_Common<0x90>; -def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; -def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; -def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; -def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; -def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; -def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; -def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; -def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -def : RsqPat; -def SIN_eg : SIN_Common<0x8D>; -def COS_eg : COS_Common<0x8E>; - -def : POW_Common ; -def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; - -defm : Expand24IBitOps; - -//===----------------------------------------------------------------------===// -// Memory read/write instructions -//===----------------------------------------------------------------------===// - -let usesCustomInserter = 1 in { - -// 32-bit store -def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, - (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr, $index_gpr, $eop", - [(global_store i32:$rw_gpr, i32:$index_gpr)] ->; - -// 64-bit store -def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, - (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", - [(global_store v2i32:$rw_gpr, i32:$index_gpr)] ->; - -//128-bit store -def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", - [(global_store v4i32:$rw_gpr, i32:$index_gpr)] ->; - -} // End usesCustomInserter = 1 - -class VTX_READ_eg buffer_id, dag outs, list pattern> - : VTX_WORD0_eg, VTX_READ { - - // Static fields - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; - let 
SRC_REL = 0; - // XXX: We can infer this field based on the SRC_GPR. This would allow us - // to store vertex addresses in any channel, not just X. - let SRC_SEL_X = 0; - - let Inst{31-0} = Word0; -} - -class VTX_READ_8_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 1; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 1; // FMT_8 -} - -class VTX_READ_16_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - let MEGA_FETCH_COUNT = 2; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 5; // FMT_16 - -} - -class VTX_READ_32_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 4; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 0xD; // COLOR_32 - - // This is not really necessary, but there were some GPU hangs that appeared - // to be caused by ALU instructions in the next instruction group that wrote - // to the $src_gpr registers of the VTX_READ. - // e.g. - // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 - // %T2_X = MOV %ZERO - //Adding this constraint prevents this from happening. - let Constraints = "$src_gpr.ptr = $dst_gpr"; -} - -class VTX_READ_64_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 8; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 7; - let DST_SEL_W = 7; - let DATA_FORMAT = 0x1D; // COLOR_32_32 -} - -class VTX_READ_128_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 16; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 - - // XXX: Need to force VTX_READ_128 instructions to write to the same register - // that holds its buffer address to avoid potential hangs. We can't use - // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst - // registers are different sizes. 
-} - -//===----------------------------------------------------------------------===// -// VTX Read from parameter memory space -//===----------------------------------------------------------------------===// - -def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -//===----------------------------------------------------------------------===// -// VTX Read from global memory space -//===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -} // End Predicates = [isEG] - -//===----------------------------------------------------------------------===// -// Evergreen / Cayman Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEGorCayman] in { - -// Should be predicated on FeatureFP64 -// def FMA_64 : R600_3OP < -// 0xA, "FMA_64", -// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] -// >; - -// BFE_UINT - bit_extract, an optimization for mask and shift -// Src0 = Input -// Src1 = Offset -// Src2 = Width -// -// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) -// -// Example Usage: -// (Offset, Width) -// -// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 -// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 -// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 -// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 -def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", - [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", - [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def : BFEPattern ; - -def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", - [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def : Pat<(i32 (sext_inreg i32:$src, i1)), - (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; -def : Pat<(i32 (sext_inreg i32:$src, i8)), - (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; -def : Pat<(i32 (sext_inreg i32:$src, i16)), - (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; - -defm : BFIPatterns ; - -def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], - VecALU ->; - -def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU ->; - -def : UMad24Pat; - -def 
BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; -def : ROTRPattern ; -def MULADD_eg : MULADD_Common<0x14>; -def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; -def FMA_eg : FMA_Common<0x7>; -def ASHR_eg : ASHR_Common<0x15>; -def LSHR_eg : LSHR_Common<0x16>; -def LSHL_eg : LSHL_Common<0x17>; -def CNDE_eg : CNDE_Common<0x19>; -def CNDGT_eg : CNDGT_Common<0x1A>; -def CNDGE_eg : CNDGE_Common<0x1B>; -def MUL_LIT_eg : MUL_LIT_Common<0x1F>; -def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; -def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU ->; -def DOT4_eg : DOT4_Common<0xBE>; -defm CUBE_eg : CUBE_Common<0xC0>; - -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; - -def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; -def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; - -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; -def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; - -let hasSideEffects = 1 in { - def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; -} - -def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; - -def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { - let Pattern = []; - let Itinerary = AnyALU; -} - -def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; - -def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { - let Pattern = []; -} - -def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; - -def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, - R600ALU_Word0, - R600ALU_Word1_OP2 <0x54> { - - let dst = 0; - let dst_rel = 0; - let src0 = 0; - let src0_rel = 0; - let src0_neg = 0; - let src0_abs = 0; - let src1 = 0; - let src1_rel = 0; - let src1_neg = 0; - let src1_abs = 0; - let write = 0; - let omod = 0; - let clamp = 0; - let last = 1; - let bank_swizzle = 0; - let pred_sel = 0; - let update_exec_mask = 0; - let update_pred = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let ALUInst = 1; -} - -def : Pat < - (int_AMDGPU_barrier_global), - (GROUP_BARRIER) ->; - -//===----------------------------------------------------------------------===// -// LDS Instructions -//===----------------------------------------------------------------------===// -class R600_LDS op, dag outs, dag ins, string asm, - list pattern = []> : - - InstR600 , - R600_ALU_LDS_Word0, - R600LDS_Word1 { - - bits<6> offset = 0; - let lds_op = op; - - let Word1{27} = offset{0}; - let Word1{12} = offset{1}; - let Word1{28} = offset{2}; - let Word1{31} = offset{3}; - let Word0{12} = offset{4}; - let Word0{25} = offset{5}; - - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let ALUInst = 1; - let HasNativeOperands = 1; - let UseNamedOperandTable = 1; -} - -class R600_LDS_1A lds_op, string name, list pattern> : R600_LDS < - lds_op, - (outs R600_Reg32:$dst), - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, - BANK_SWIZZLE:$bank_swizzle), - " "#name#" $last OQAP, $src0$src0_rel $pred_sel", - pattern - > { - - let src1 = 0; - let src1_rel = 0; - let src2 = 0; - let src2_rel = 0; - - let usesCustomInserter = 1; - let LDS_1A = 1; - let DisableEncoding = "$dst"; -} - -class R600_LDS_1A1D lds_op, dag outs, string name, list pattern, - string dst =""> : - R600_LDS < - lds_op, outs, - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, - BANK_SWIZZLE:$bank_swizzle), 
- " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", - pattern - > { - - field string BaseOp; - - let src2 = 0; - let src2_rel = 0; - let LDS_1A1D = 1; -} - -class R600_LDS_1A1D_NORET lds_op, string name, list pattern> : - R600_LDS_1A1D { - let BaseOp = name; -} - -class R600_LDS_1A1D_RET lds_op, string name, list pattern> : - R600_LDS_1A1D { - - let BaseOp = name; - let usesCustomInserter = 1; - let DisableEncoding = "$dst"; -} - -class R600_LDS_1A2D lds_op, dag outs, string name, list pattern, - string dst =""> : - R600_LDS < - lds_op, outs, - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, - R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), - " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", - pattern> { - - field string BaseOp; - - let LDS_1A1D = 0; - let LDS_1A2D = 1; -} - -class R600_LDS_1A2D_NORET lds_op, string name, list pattern> : - R600_LDS_1A2D { - let BaseOp = name; -} - -class R600_LDS_1A2D_RET lds_op, string name, list pattern> : - R600_LDS_1A2D { - - let BaseOp = name; - let usesCustomInserter = 1; - let DisableEncoding = "$dst"; -} - -def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; -def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; -def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; -def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; -def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; -def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; -def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; -def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; -def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; -def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; -def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; -def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", - [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] ->; -def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", - [(truncstorei8_local i32:$src1, i32:$src0)] ->; -def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", - [(truncstorei16_local i32:$src1, i32:$src0)] ->; -def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] ->; -def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] ->; -def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] ->; -def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] ->; -def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] ->; -def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] ->; -def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] ->; -def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] ->; -def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] ->; -def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] ->; -def 
LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] ->; -def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", - [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] ->; -def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", - [(set i32:$dst, (sextloadi8_local i32:$src0))] ->; -def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", - [(set i32:$dst, (az_extloadi8_local i32:$src0))] ->; -def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", - [(set i32:$dst, (sextloadi16_local i32:$src0))] ->; -def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", - [(set i32:$dst, (az_extloadi16_local i32:$src0))] ->; - -// TRUNC is used for the FLT_TO_INT instructions to work around a -// perceived problem where the rounding modes are applied differently -// depending on the instruction and the slot they are in. -// See: -// https://bugs.freedesktop.org/show_bug.cgi?id=50232 -// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c -// -// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, -// which do not need to be truncated since the fp values are 0.0f or 1.0f. -// We should look into handling these cases separately. -def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; - -def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; - -// SHA-256 Patterns -def : SHA256MaPattern ; - -def EG_ExportSwz : ExportSwzInst { - let Word1{19-16} = 0; // BURST_COUNT - let Word1{20} = 0; // VALID_PIXEL_MODE - let Word1{21} = eop; - let Word1{29-22} = inst; - let Word1{30} = 0; // MARK - let Word1{31} = 1; // BARRIER -} -defm : ExportPattern; - -def EG_ExportBuf : ExportBufInst { - let Word1{19-16} = 0; // BURST_COUNT - let Word1{20} = 0; // VALID_PIXEL_MODE - let Word1{21} = eop; - let Word1{29-22} = inst; - let Word1{30} = 0; // MARK - let Word1{31} = 1; // BARRIER -} -defm : SteamOutputExportPattern; - -def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), - "TEX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} -def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), - "VTX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} -def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), - "LOOP_START_DX10 @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), - "LOOP_BREAK @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), - "CONTINUE @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "JUMP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "PUSH @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "ELSE @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { - let ADDR = 0; - let COUNT = 0; - let POP_COUNT = 0; -} -def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "POP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { - let COUNT = 0; - let POP_COUNT = 0; - let ADDR = 0; - let END_OF_PROGRAM = 1; -} - -} // End Predicates = [isEGorCayman] diff --git 
a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp deleted file mode 100644 index e811d5cff22..00000000000 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ /dev/null @@ -1,642 +0,0 @@ -//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUInstPrinter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/MathExtras.h" - -using namespace llvm; - -void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { - OS.flush(); - printInstruction(MI, OS); - - printAnnotation(OS, Annot); -} - -void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); -} - -void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " offen"; -} - -void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " idxen"; -} - -void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " addr64"; -} - -void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset0:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset1:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " gds"; -} - -void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " glc"; -} - -void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if 
(MI->getOperand(OpNo).getImm()) - O << " slc"; -} - -void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " tfe"; -} - -void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, - const MCRegisterInfo &MRI) { - switch (reg) { - case AMDGPU::VCC: - O << "vcc"; - return; - case AMDGPU::SCC: - O << "scc"; - return; - case AMDGPU::EXEC: - O << "exec"; - return; - case AMDGPU::M0: - O << "m0"; - return; - case AMDGPU::FLAT_SCR: - O << "flat_scratch"; - return; - case AMDGPU::VCC_LO: - O << "vcc_lo"; - return; - case AMDGPU::VCC_HI: - O << "vcc_hi"; - return; - case AMDGPU::EXEC_LO: - O << "exec_lo"; - return; - case AMDGPU::EXEC_HI: - O << "exec_hi"; - return; - case AMDGPU::FLAT_SCR_LO: - O << "flat_scratch_lo"; - return; - case AMDGPU::FLAT_SCR_HI: - O << "flat_scratch_hi"; - return; - default: - break; - } - - char Type; - unsigned NumRegs; - - if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 3; - } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 16; - } else { - O << getRegisterName(reg); - return; - } - - // The low 8 bits of the encoding value is the register index, for both VGPRs - // and SGPRs. 
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); - if (NumRegs == 1) { - O << Type << RegIdx; - return; - } - - O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; -} - -void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) - O << "_e64 "; - else - O << "_e32 "; - - printOperand(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { - int32_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { - O << SImm; - return; - } - - if (Imm == FloatToBits(0.0f)) - O << "0.0"; - else if (Imm == FloatToBits(1.0f)) - O << "1.0"; - else if (Imm == FloatToBits(-1.0f)) - O << "-1.0"; - else if (Imm == FloatToBits(0.5f)) - O << "0.5"; - else if (Imm == FloatToBits(-0.5f)) - O << "-0.5"; - else if (Imm == FloatToBits(2.0f)) - O << "2.0"; - else if (Imm == FloatToBits(-2.0f)) - O << "-2.0"; - else if (Imm == FloatToBits(4.0f)) - O << "4.0"; - else if (Imm == FloatToBits(-4.0f)) - O << "-4.0"; - else - O << formatHex(static_cast(Imm)); -} - -void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { - int64_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { - O << SImm; - return; - } - - if (Imm == DoubleToBits(0.0)) - O << "0.0"; - else if (Imm == DoubleToBits(1.0)) - O << "1.0"; - else if (Imm == DoubleToBits(-1.0)) - O << "-1.0"; - else if (Imm == DoubleToBits(0.5)) - O << "0.5"; - else if (Imm == DoubleToBits(-0.5)) - O << "-0.5"; - else if (Imm == DoubleToBits(2.0)) - O << "2.0"; - else if (Imm == DoubleToBits(-2.0)) - O << "-2.0"; - else if (Imm == DoubleToBits(4.0)) - O << "4.0"; - else if (Imm == DoubleToBits(-4.0)) - O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); -} - -void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - switch (Op.getReg()) { - // This is the default predicate state, so we don't need to print it. - case AMDGPU::PRED_SEL_OFF: - break; - - default: - printRegOperand(Op.getReg(), O, MRI); - break; - } - } else if (Op.isImm()) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int RCID = Desc.OpInfo[OpNo].RegClass; - if (RCID != -1) { - const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); - if (ImmRC.getSize() == 4) - printImmediate32(Op.getImm(), O); - else if (ImmRC.getSize() == 8) - printImmediate64(Op.getImm(), O); - else - llvm_unreachable("Invalid register class size"); - } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { - printImmediate32(Op.getImm(), O); - } else { - // We hit this for the immediate instruction bits that don't yet have a - // custom printer. - // TODO: Eventually this should be unnecessary. - O << formatDec(Op.getImm()); - } - } else if (Op.isFPImm()) { - // We special case 0.0 because otherwise it will be printed as an integer. 
- if (Op.getFPImm() == 0.0) - O << "0.0"; - else { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); - - if (ImmRC.getSize() == 4) - printImmediate32(FloatToBits(Op.getFPImm()), O); - else if (ImmRC.getSize() == 8) - printImmediate64(DoubleToBits(Op.getFPImm()), O); - else - llvm_unreachable("Invalid register class size"); - } - } else if (Op.isExpr()) { - const MCExpr *Exp = Op.getExpr(); - Exp->print(O, &MAI); - } else { - llvm_unreachable("unknown operand type in printOperand"); - } -} - -void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & SISrcMods::NEG) - O << '-'; - if (InputModifiers & SISrcMods::ABS) - O << '|'; - printOperand(MI, OpNo + 1, O); - if (InputModifiers & SISrcMods::ABS) - O << '|'; -} - -void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - - if (Imm == 2) { - O << "P0"; - } else if (Imm == 1) { - O << "P20"; - } else if (Imm == 0) { - O << "P10"; - } else { - llvm_unreachable("Invalid interpolation parameter slot"); - } -} - -void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printOperand(MI, OpNo, O); - O << ", "; - printOperand(MI, OpNo + 1, O); -} - -void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, StringRef Asm, - StringRef Default) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm()); - if (Op.getImm() == 1) { - O << Asm; - } else { - O << Default; - } -} - -void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "|"); -} - -void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "_SAT"); -} - -void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " clamp"; -} - -void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int Imm = MI->getOperand(OpNo).getImm(); - if (Imm == SIOutMods::MUL2) - O << " mul:2"; - else if (Imm == SIOutMods::MUL4) - O << " mul:4"; - else if (Imm == SIOutMods::DIV2) - O << " div:2"; -} - -void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int32_t Imm = MI->getOperand(OpNo).getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; -} - -void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "*", " "); -} - -void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "-"); -} - -void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - switch (MI->getOperand(OpNo).getImm()) { - default: break; - case 1: - O << " * 2.0"; - break; - case 2: - O << " * 4.0"; - break; - case 3: - O << " / 2.0"; - break; - } -} - -void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "+"); -} - -void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "ExecMask,"); -} - -void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "Pred,"); -} - -void AMDGPUInstPrinter::printWrite(const MCInst 
*MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() == 0) { - O << " (MASKED)"; - } -} - -void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const char * chans = "XYZW"; - int sel = MI->getOperand(OpNo).getImm(); - - int chan = sel & 3; - sel >>= 2; - - if (sel >= 512) { - sel -= 512; - int cb = sel >> 12; - sel &= 4095; - O << cb << '[' << sel << ']'; - } else if (sel >= 448) { - sel -= 448; - O << sel; - } else if (sel >= 0){ - O << sel; - } - - if (sel >= 0) - O << '.' << chans[chan]; -} - -void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int BankSwizzle = MI->getOperand(OpNo).getImm(); - switch (BankSwizzle) { - case 1: - O << "BS:VEC_021/SCL_122"; - break; - case 2: - O << "BS:VEC_120/SCL_212"; - break; - case 3: - O << "BS:VEC_102/SCL_221"; - break; - case 4: - O << "BS:VEC_201"; - break; - case 5: - O << "BS:VEC_210"; - break; - default: - break; - } - return; -} - -void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Sel = MI->getOperand(OpNo).getImm(); - switch (Sel) { - case 0: - O << 'X'; - break; - case 1: - O << 'Y'; - break; - case 2: - O << 'Z'; - break; - case 3: - O << 'W'; - break; - case 4: - O << '0'; - break; - case 5: - O << '1'; - break; - case 7: - O << '_'; - break; - default: - break; - } -} - -void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned CT = MI->getOperand(OpNo).getImm(); - switch (CT) { - case 0: - O << 'U'; - break; - case 1: - O << 'N'; - break; - default: - break; - } -} - -void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int KCacheMode = MI->getOperand(OpNo).getImm(); - if (KCacheMode > 0) { - int KCacheBank = MI->getOperand(OpNo - 2).getImm(); - O << "CB" << KCacheBank << ':'; - int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); - int LineSize = (KCacheMode == 1) ? 16 : 32; - O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; - } -} - -void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Msg = SImm16 & 0xF; - if (Msg == 2 || Msg == 3) { - unsigned Op = (SImm16 >> 4) & 0xF; - if (Msg == 3) - O << "Gs_done("; - else - O << "Gs("; - if (Op == 0) { - O << "nop"; - } else { - unsigned Stream = (SImm16 >> 8) & 0x3; - if (Op == 1) - O << "cut"; - else if (Op == 2) - O << "emit"; - else if (Op == 3) - O << "emit-cut"; - O << " stream " << Stream; - } - O << "), [m0] "; - } else if (Msg == 1) - O << "interrupt "; - else if (Msg == 15) - O << "system "; - else - O << "unknown(" << Msg << ") "; -} - -void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs - // SIInsertWaits.cpp bits usage does not match ISA docs description but it - // works so it might be a misprint in docs. 
- unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Vmcnt = SImm16 & 0xF; - unsigned Expcnt = (SImm16 >> 4) & 0xF; - unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; - - bool NeedSpace = false; - - if (Vmcnt != 0xF) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } - - if (Expcnt != 0x7) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } - - if (Lgkmcnt != 0x7) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } -} - -#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h deleted file mode 100644 index 14fb511e923..00000000000 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ /dev/null @@ -1,88 +0,0 @@ -//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -class AMDGPUInstPrinter : public MCInstPrinter { -public: - AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - //Autogenerated by tblgen - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - static void printRegOperand(unsigned RegNo, raw_ostream &O, - const MCRegisterInfo &MRI); - -private: - void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printRegOperand(unsigned RegNo, raw_ostream &O); - void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printImmediate32(uint32_t I, raw_ostream &O); - void printImmediate64(uint64_t I, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void 
printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, - StringRef Asm, StringRef Default = ""); - static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O); - static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/InstPrinter/CMakeLists.txt b/lib/Target/R600/InstPrinter/CMakeLists.txt deleted file mode 100644 index dcd87037fab..00000000000 --- a/lib/Target/R600/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMR600AsmPrinter - AMDGPUInstPrinter.cpp - ) diff --git a/lib/Target/R600/InstPrinter/LLVMBuild.txt b/lib/Target/R600/InstPrinter/LLVMBuild.txt deleted file mode 100644 index ec0be89f104..00000000000 --- a/lib/Target/R600/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600AsmPrinter -parent = R600 -required_libraries = MC Support -add_to_library_groups = R600 - diff --git a/lib/Target/R600/InstPrinter/Makefile b/lib/Target/R600/InstPrinter/Makefile deleted file mode 100644 index a794cc1124e..00000000000 --- a/lib/Target/R600/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. 
-# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600AsmPrinter - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt deleted file mode 100644 index f3f254fdcba..00000000000 --- a/lib/Target/R600/LLVMBuild.txt +++ /dev/null @@ -1,33 +0,0 @@ -;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo - -[component_0] -type = TargetGroup -name = R600 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 - -[component_1] -type = Library -name = R600CodeGen -parent = R600 -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC R600AsmParser R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils -add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp deleted file mode 100644 index 8bed2deef4c..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -namespace { - -class AMDGPUMCObjectWriter : public MCObjectWriter { -public: - AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} - void executePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) override { - //XXX: Implement if necessary. 
- } - void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, - MCValue Target, bool &IsPCRel, - uint64_t &FixedValue) override { - assert(!"Not implemented"); - } - - void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; - -}; - -class AMDGPUAsmBackend : public MCAsmBackend { -public: - AMDGPUAsmBackend(const Target &T) - : MCAsmBackend() {} - - unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override { - return false; - } - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { - assert(!"Not implemented"); - } - bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; -}; - -} //End anonymous namespace - -void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, - const MCAsmLayout &Layout) { - for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { - Asm.writeSectionData(&*I, Layout); - } -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("Unknown fixup kind"); - case AMDGPU::fixup_si_sopp_br: { - uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - *Dst = (Value - 4) / 4; - break; - } - - case AMDGPU::fixup_si_rodata: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. 
- *Dst = Value + 4; - break; - } - } -} - -const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( - MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { - // name offset bits flags - { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - return Infos[Kind - FirstTargetFixupKind]; -} - -bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - OW->WriteZeros(Count); - - return true; -} - -//===----------------------------------------------------------------------===// -// ELFAMDGPUAsmBackend class -//===----------------------------------------------------------------------===// - -namespace { - -class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { -public: - ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(OS); - } -}; - -} // end anonymous namespace - -MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU) { - return new ELFAMDGPUAsmBackend(T); -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp deleted file mode 100644 index 59f45ff02d8..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCTargetDesc.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCFixup.h" - -using namespace llvm; - -namespace { - -class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { -public: - AMDGPUELFObjectWriter(); -protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override { - return Fixup.getKind(); - } - -}; - - -} // End anonymous namespace - -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() - : MCELFObjectTargetWriter(false, 0, 0, false) { } - -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { - MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); - return createELFObjectWriter(MOTW, OS, true); -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h deleted file mode 100644 index 01021d67ffd..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h +++ /dev/null @@ -1,34 +0,0 @@ -//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace AMDGPU { -enum Fixups { - /// 16-bit PC relative fixup for SOPP branch instructions. 
- fixup_si_sopp_br = FirstTargetFixupKind, - - /// fixup for global addresses with constant initializers - fixup_si_rodata, - - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; -} -} - -#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp deleted file mode 100644 index 028a86dfc7a..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ /dev/null @@ -1,43 +0,0 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCAsmInfo.h" - -using namespace llvm; -AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { - HasSingleParameterDotFile = false; - //===------------------------------------------------------------------===// - MaxInstLength = 16; - SeparatorString = "\n"; - CommentString = ";"; - PrivateLabelPrefix = ""; - InlineAsmStart = ";#ASMSTART"; - InlineAsmEnd = ";#ASMEND"; - - //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; - SunStyleELFSectionSwitchSyntax = true; - UsesELFSectionDirectiveForBSS = true; - - //===--- Global Variable Emission Directives --------------------------===// - HasAggressiveSymbolFolding = true; - COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; - HasNoDeadStrip = true; - WeakRefDirective = ".weakref\t"; - //===--- Dwarf Emission Directives -----------------------------------===// - SupportsDebugInformation = true; -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h deleted file mode 100644 index a5bac51e356..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ /dev/null @@ -1,32 +0,0 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H - -#include "llvm/MC/MCAsmInfoELF.h" -namespace llvm { - -class Triple; - -// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, -// you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value -// for PrivateGlobalPrefix is 'L', so it will consider any function starting -// with 'L' as a local symbol. 
-class AMDGPUMCAsmInfo : public MCAsmInfoELF { -public: - explicit AMDGPUMCAsmInfo(const Triple &TT); -}; -} // namespace llvm -#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp deleted file mode 100644 index 521b3b39bba..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCCodeEmitter.h" - -using namespace llvm; - -// pin vtable to this file -void AMDGPUMCCodeEmitter::anchor() {} - diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h deleted file mode 100644 index c9574276223..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ /dev/null @@ -1,50 +0,0 @@ -//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H - -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -class MCInst; -class MCOperand; -class MCSubtargetInfo; - -class AMDGPUMCCodeEmitter : public MCCodeEmitter { - virtual void anchor(); -public: - - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } - - virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp deleted file mode 100644 index 02192c40f92..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ /dev/null @@ -1,90 +0,0 @@ -//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This file provides AMDGPU specific target descriptions. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCTargetDesc.h" -#include "AMDGPUMCAsmInfo.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "SIDefines.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MachineLocation.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_MC_DESC -#include "AMDGPUGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "AMDGPUGenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "AMDGPUGenRegisterInfo.inc" - -static MCInstrInfo *createAMDGPUMCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitAMDGPUMCInstrInfo(X); - return X; -} - -static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitAMDGPUMCRegisterInfo(X, 0); - return X; -} - -static MCSubtargetInfo * -createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo * X = new MCSubtargetInfo(); - InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { - return new AMDGPUInstPrinter(MAI, MII, MRI); -} - -extern "C" void LLVMInitializeR600TargetMC() { - for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { - RegisterMCAsmInfo X(*T); - - TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); - TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); - TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); - TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); - TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); - TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); - } - - TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, - createR600MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h deleted file mode 100644 index 92e29dc7037..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ /dev/null @@ -1,61 +0,0 @@ -//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Provides AMDGPU specific target descriptions. 
-// -//===----------------------------------------------------------------------===// -// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include "llvm/ADT/StringRef.h" - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCObjectWriter; -class MCRegisterInfo; -class MCSubtargetInfo; -class Target; -class Triple; -class raw_pwrite_stream; -class raw_ostream; - -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; - -MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx); - -MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx); - -MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); - -MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); -} // End llvm namespace - -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" - -#define GET_INSTRINFO_ENUM -#include "AMDGPUGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "AMDGPUGenSubtargetInfo.inc" - -#endif diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index 801c9054937..00000000000 --- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ - -add_llvm_library(LLVMR600Desc - AMDGPUAsmBackend.cpp - AMDGPUELFObjectWriter.cpp - AMDGPUMCCodeEmitter.cpp - AMDGPUMCTargetDesc.cpp - AMDGPUMCAsmInfo.cpp - R600MCCodeEmitter.cpp - SIMCCodeEmitter.cpp - ) diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index 74b8ca09ae1..00000000000 --- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600Desc -parent = R600 -required_libraries = MC R600AsmPrinter R600Info Support -add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/Makefile b/lib/Target/R600/MCTargetDesc/Makefile deleted file mode 100644 index 8894a7607f4..00000000000 --- a/lib/Target/R600/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp deleted file mode 100644 index e683498d52a..00000000000 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// \brief The R600 code emitter produces machine code that can be executed -/// directly on the GPU device. -// -//===----------------------------------------------------------------------===// - -#include "R600Defines.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/EndianStream.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; - void operator=(const R600MCCodeEmitter &) = delete; - const MCInstrInfo &MCII; - const MCRegisterInfo &MRI; - -public: - - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : MCII(mcii), MRI(mri) { } - - /// \brief Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \returns the encoding for an MCOperand. 
- uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; -private: - - void EmitByte(unsigned int byte, raw_ostream &OS) const; - - void Emit(uint32_t value, raw_ostream &OS) const; - void Emit(uint64_t value, raw_ostream &OS) const; - - unsigned getHWRegChan(unsigned reg) const; - unsigned getHWReg(unsigned regNo) const; - -}; - -} // End anonymous namespace - -enum RegElement { - ELEMENT_X = 0, - ELEMENT_Y, - ELEMENT_Z, - ELEMENT_W -}; - -enum FCInstr { - FC_IF_PREDICATE = 0, - FC_ELSE, - FC_ENDIF, - FC_BGNLOOP, - FC_ENDLOOP, - FC_BREAK_PREDICATE, - FC_CONTINUE -}; - -MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI); -} - -void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::FETCH_CLAUSE || - MI.getOpcode() == AMDGPU::ALU_CLAUSE || - MI.getOpcode() == AMDGPU::BUNDLE || - MI.getOpcode() == AMDGPU::KILL) { - return; - } else if (IS_VTX(Desc)) { - uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); - uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { - InstWord2 |= 1 << 19; // Mega-Fetch bit - } - - Emit(InstWord01, OS); - Emit(InstWord2, OS); - Emit((uint32_t) 0, OS); - } else if (IS_TEX(Desc)) { - int64_t Sampler = MI.getOperand(14).getImm(); - - int64_t SrcSelect[4] = { - MI.getOperand(2).getImm(), - MI.getOperand(3).getImm(), - MI.getOperand(4).getImm(), - MI.getOperand(5).getImm() - }; - int64_t Offsets[3] = { - MI.getOperand(6).getImm() & 0x1F, - MI.getOperand(7).getImm() & 0x1F, - MI.getOperand(8).getImm() & 0x1F - }; - - uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); - uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | - SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | - SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | - Offsets[2] << 10; - - Emit(Word01, OS); - Emit(Word2, OS); - Emit((uint32_t) 0, OS); - } else { - uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && - ((Desc.TSFlags & R600_InstFlag::OP1) || - Desc.TSFlags & R600_InstFlag::OP2)) { - uint64_t ISAOpCode = Inst & (0x3FFULL << 39); - Inst &= ~(0x3FFULL << 39); - Inst |= ISAOpCode << 1; - } - Emit(Inst, OS); - } -} - -void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { - OS.write((uint8_t) Byte & 0xff); -} - -void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { - support::endian::Writer(OS).write(Value); -} - -void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { - support::endian::Writer(OS).write(Value); -} - -unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { - return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; -} - -unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { - return MRI.getEncodingValue(RegNo) & HW_REG_MASK; -} - -uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl &Fixup, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) { - if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) - return MRI.getEncodingValue(MO.getReg()); - return getHWReg(MO.getReg()); - } - - assert(MO.isImm()); 
- return MO.getImm(); -} - -#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp deleted file mode 100644 index 65a0eeba2b1..00000000000 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ /dev/null @@ -1,289 +0,0 @@ -//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The SI code emitter produces machine code that can be executed -/// directly on the GPU device. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; - void operator=(const SIMCCodeEmitter &) = delete; - const MCInstrInfo &MCII; - const MCRegisterInfo &MRI; - MCContext &Ctx; - - /// \brief Can this operand also contain immediate values? - bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; - - /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; - -public: - SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } - - ~SIMCCodeEmitter() override {} - - /// \brief Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \returns the encoding for an MCOperand. - uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \brief Use a fixup to encode the simm16 field for SOPP branch - /// instructions. - unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; -}; - -} // End anonymous namespace - -MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new SIMCCodeEmitter(MCII, MRI, Ctx); -} - -bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, - unsigned OpNo) const { - unsigned OpType = Desc.OpInfo[OpNo].OperandType; - - return OpType == AMDGPU::OPERAND_REG_IMM32 || - OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - -// Returns the encoding value to use if the given integer is an integer inline -// immediate value, or 0 if it is not. 
-template -static uint32_t getIntInlineImmEncoding(IntTy Imm) { - if (Imm >= 0 && Imm <= 64) - return 128 + Imm; - - if (Imm >= -16 && Imm <= -1) - return 192 + std::abs(Imm); - - return 0; -} - -static uint32_t getLit32Encoding(uint32_t Val) { - uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == FloatToBits(0.5f)) - return 240; - - if (Val == FloatToBits(-0.5f)) - return 241; - - if (Val == FloatToBits(1.0f)) - return 242; - - if (Val == FloatToBits(-1.0f)) - return 243; - - if (Val == FloatToBits(2.0f)) - return 244; - - if (Val == FloatToBits(-2.0f)) - return 245; - - if (Val == FloatToBits(4.0f)) - return 246; - - if (Val == FloatToBits(-4.0f)) - return 247; - - return 255; -} - -static uint32_t getLit64Encoding(uint64_t Val) { - uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == DoubleToBits(0.5)) - return 240; - - if (Val == DoubleToBits(-0.5)) - return 241; - - if (Val == DoubleToBits(1.0)) - return 242; - - if (Val == DoubleToBits(-1.0)) - return 243; - - if (Val == DoubleToBits(2.0)) - return 244; - - if (Val == DoubleToBits(-2.0)) - return 245; - - if (Val == DoubleToBits(4.0)) - return 246; - - if (Val == DoubleToBits(-4.0)) - return 247; - - return 255; -} - -uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, - unsigned OpSize) const { - if (MO.isExpr()) - return 255; - - assert(!MO.isFPImm()); - - if (!MO.isImm()) - return ~0; - - if (OpSize == 4) - return getLit32Encoding(static_cast(MO.getImm())); - - assert(OpSize == 8); - - return getLit64Encoding(static_cast(MO.getImm())); -} - -void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - - uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - unsigned bytes = Desc.getSize(); - - for (unsigned i = 0; i < bytes; i++) { - OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); - } - - if (bytes > 4) - return; - - // Check for additional literals in SRC0/1/2 (Op 1/2/3) - for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { - - // Check if this operand should be encoded as [SV]Src - if (!isSrcOperand(Desc, i)) - continue; - - int RCID = Desc.OpInfo[i].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - - // Is this operand a literal immediate? - const MCOperand &Op = MI.getOperand(i); - if (getLitEncoding(Op, RC.getSize()) != 255) - continue; - - // Yes! Encode it - int64_t Imm = 0; - - if (Op.isImm()) - Imm = Op.getImm(); - else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. 
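For reference, the inline-constant rules implemented by getIntInlineImmEncoding and getLit32Encoding above are compact enough to restate on their own: integers 0 through 64 encode as 128 + N, integers -16 through -1 as 192 + |N|, eight specific float bit patterns as 240 through 247, and everything else as 255 followed by a 32-bit literal dword after the instruction. The following is only a standalone sketch of that mapping in plain C++ (floatBits stands in for llvm::FloatToBits; none of the real MC interfaces are used):

.. code-block:: cpp

  // Sketch of the SI 32-bit source-operand encoding rules described above.
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static uint32_t floatBits(float F) {            // stand-in for llvm::FloatToBits
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    return Bits;
  }

  static uint32_t lit32Encoding(uint32_t Val) {
    int32_t S = static_cast<int32_t>(Val);
    if (S >= 0 && S <= 64)
      return 128 + S;                             // inline integers 0..64
    if (S >= -16 && S <= -1)
      return 192 - S;                             // inline integers -1..-16
    const float Consts[] = {0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f};
    for (int I = 0; I < 8; ++I)
      if (Val == floatBits(Consts[I]))
        return 240 + I;                           // inline float constants
    return 255;                                   // a literal dword must follow
  }

  int main() {
    printf("%u\n", (unsigned)lit32Encoding(1));                // 129
    printf("%u\n", (unsigned)lit32Encoding(uint32_t(-1)));     // 193
    printf("%u\n", (unsigned)lit32Encoding(floatBits(-2.0f))); // 245
    printf("%u\n", (unsigned)lit32Encoding(100));              // 255
  }

The 64-bit path in getLit64Encoding follows the same pattern, only comparing against DoubleToBits of the same eight constants.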
- llvm_unreachable("Must be immediate or expr"); - - for (unsigned j = 0; j < 4; j++) { - OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); - } - - // Only one literal value allowed - break; - } -} - -unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpNo); - - if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; - Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); - return 0; - } - - return getMachineOpValue(MI, MO, Fixups, STI); -} - -uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return MRI.getEncodingValue(MO.getReg()); - - if (MO.isExpr()) { - const MCSymbolRefExpr *Expr = cast(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. - Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } - Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); - } - - // Figure out the operand number, needed for isSrcOperand check - unsigned OpNo = 0; - for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { - if (&MO == &MI.getOperand(OpNo)) - break; - } - - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (isSrcOperand(Desc, OpNo)) { - int RCID = Desc.OpInfo[OpNo].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - - uint32_t Enc = getLitEncoding(MO, RC.getSize()); - if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) - return Enc; - - } else if (MO.isImm()) - return MO.getImm(); - - llvm_unreachable("Encoding of this operand type is not supported yet."); - return 0; -} - diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile deleted file mode 100644 index 64a7c8c045c..00000000000 --- a/lib/Target/R600/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMR600CodeGen -TARGET = AMDGPU - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ - AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ - AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ - AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ - AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc - -DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td deleted file mode 100644 index c0ffede5199..00000000000 --- a/lib/Target/R600/Processors.td +++ /dev/null @@ -1,137 +0,0 @@ -//===-- Processors.td - R600 Processor definitions ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -class Proc Features> -: Processor; - -//===----------------------------------------------------------------------===// -// R600 -//===----------------------------------------------------------------------===// -def : Proc<"", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache]>; - -def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"r630", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rs880", R600_VLIW5_Itin, - [FeatureR600, FeatureWavefrontSize16]>; - -def : Proc<"rv670", R600_VLIW5_Itin, - [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// R700 -//===----------------------------------------------------------------------===// - -def : Proc<"rv710", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv730", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv770", R600_VLIW5_Itin, - [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Evergreen -//===----------------------------------------------------------------------===// - -def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, - FeatureCFALUBug]>; - -def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, - FeatureCFALUBug]>; - -def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; - -def : Proc<"juniper", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"cypress", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureFP64, FeatureVertexCache, - FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Northern Islands -//===----------------------------------------------------------------------===// - -def : Proc<"barts", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"turks", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureCFALUBug]>; - -def : Proc<"cayman", R600_VLIW4_Itin, - [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; - -//===----------------------------------------------------------------------===// -// Southern Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] ->; - -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] ->; - -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -//===----------------------------------------------------------------------===// -// Sea Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"bonaire", 
SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16] ->; - -def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16]>; - -//===----------------------------------------------------------------------===// -// Volcanic Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug] ->; - -def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug] ->; - -def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp deleted file mode 100644 index 3cb90218a7d..00000000000 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ /dev/null @@ -1,206 +0,0 @@ -//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. -/// This pass is merging consecutive CFAlus where applicable. -/// It needs to be called after IfCvt for best results. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "r600mergeclause" - -namespace { - -static bool isCFAlu(const MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: - return true; - default: - return false; - } -} - -class R600ClauseMergePass : public MachineFunctionPass { - -private: - static char ID; - const R600InstrInfo *TII; - - unsigned getCFAluSize(const MachineInstr *MI) const; - bool isCFAluEnabled(const MachineInstr *MI) const; - - /// IfCvt pass can generate "disabled" ALU clause marker that need to be - /// removed and their content affected to the previous alu clause. - /// This function parse instructions after CFAlu until it find a disabled - /// CFAlu and merge the content, or an enabled CFAlu. - void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; - - /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if - /// it is the case. 
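The merge test implemented below is easiest to see with concrete numbers: two back-to-back CF_ALU markers with COUNT 3 and COUNT 5 collapse into one marker with COUNT 8, provided the summed count stays under getMaxAlusPerClause() and any constant-cache bank enabled by both clauses points at the same bank/line pair. A rough standalone model of that decision follows; the struct and field names are invented for the example and do not correspond to the real MachineInstr operands (the real code also refuses to merge into a CF_ALU_PUSH_BEFORE root):

.. code-block:: cpp

  // Simplified model of the CF_ALU merge test in R600ClauseMergePass.
  #include <cstdio>

  struct CFAlu {
    unsigned Count;                      // COUNT operand
    bool KCacheMode[2];                  // KCACHE_MODE0/1 (false = bank unused)
    unsigned KCacheBank[2], KCacheAddr[2];
  };

  static bool canMerge(const CFAlu &Root, const CFAlu &Later, unsigned MaxAlus) {
    if (Root.Count + Later.Count >= MaxAlus)
      return false;                      // merged clause would be too large
    for (int B = 0; B < 2; ++B)
      if (Root.KCacheMode[B] && Later.KCacheMode[B] &&
          (Root.KCacheBank[B] != Later.KCacheBank[B] ||
           Root.KCacheAddr[B] != Later.KCacheAddr[B]))
        return false;                    // both lock bank B but at different lines
    return true;
  }

  int main() {
    CFAlu Root{3, {true, false}, {0, 0}, {4, 0}};
    CFAlu Later{5, {true, false}, {0, 0}, {4, 0}};
    printf("mergeable: %d\n", canMerge(Root, Later, 120)); // 1: counts and KC0 agree
  }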
- bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) - const; - -public: - R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override; -}; - -char R600ClauseMergePass::ID = 0; - -unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { - assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); -} - -bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { - assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); -} - -void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) - const { - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); - I++; - do { - while (I!= E && !isCFAlu(I)) - I++; - if (I == E) - return; - MachineInstr *MI = I++; - if (isCFAluEnabled(MI)) - break; - CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); - MI->eraseFromParent(); - } while (I != E); -} - -bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, - const MachineInstr *LatrCFAlu) const { - assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - unsigned RootInstCount = getCFAluSize(RootCFAlu), - LaterInstCount = getCFAluSize(LatrCFAlu); - unsigned CumuledInsts = RootInstCount + LaterInstCount; - if (CumuledInsts >= TII->getMaxAlusPerClause()) { - DEBUG(dbgs() << "Excess inst counts\n"); - return false; - } - if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) - return false; - // Is KCache Bank 0 compatible ? - int Mode0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); - int KBank0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); - int KBank0LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); - if (LatrCFAlu->getOperand(Mode0Idx).getImm() && - RootCFAlu->getOperand(Mode0Idx).getImm() && - (LatrCFAlu->getOperand(KBank0Idx).getImm() != - RootCFAlu->getOperand(KBank0Idx).getImm() || - LatrCFAlu->getOperand(KBank0LineIdx).getImm() != - RootCFAlu->getOperand(KBank0LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); - return false; - } - // Is KCache Bank 1 compatible ? 
- int Mode1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); - int KBank1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); - int KBank1LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); - if (LatrCFAlu->getOperand(Mode1Idx).getImm() && - RootCFAlu->getOperand(Mode1Idx).getImm() && - (LatrCFAlu->getOperand(KBank1Idx).getImm() != - RootCFAlu->getOperand(KBank1Idx).getImm() || - LatrCFAlu->getOperand(KBank1LineIdx).getImm() != - RootCFAlu->getOperand(KBank1LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); - return false; - } - if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { - RootCFAlu->getOperand(Mode0Idx).setImm( - LatrCFAlu->getOperand(Mode0Idx).getImm()); - RootCFAlu->getOperand(KBank0Idx).setImm( - LatrCFAlu->getOperand(KBank0Idx).getImm()); - RootCFAlu->getOperand(KBank0LineIdx).setImm( - LatrCFAlu->getOperand(KBank0LineIdx).getImm()); - } - if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { - RootCFAlu->getOperand(Mode1Idx).setImm( - LatrCFAlu->getOperand(Mode1Idx).getImm()); - RootCFAlu->getOperand(KBank1Idx).setImm( - LatrCFAlu->getOperand(KBank1Idx).getImm()); - RootCFAlu->getOperand(KBank1LineIdx).setImm( - LatrCFAlu->getOperand(KBank1LineIdx).getImm()); - } - RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); - RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); - return true; -} - -bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - MachineBasicBlock::iterator LatestCFAlu = E; - while (I != E) { - MachineInstr *MI = I++; - if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || - TII->mustBeLastInClause(MI->getOpcode())) - LatestCFAlu = E; - if (!isCFAlu(MI)) - continue; - cleanPotentialDisabledCFAlu(MI); - - if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { - MI->eraseFromParent(); - } else { - assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); - LatestCFAlu = MI; - } - } - } - return false; -} - -const char *R600ClauseMergePass::getPassName() const { - return "R600 Merge Clause Markers Pass"; -} - -} // end anonymous namespace - - -llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { - return new R600ClauseMergePass(TM); -} diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp deleted file mode 100644 index c8f37f61fc1..00000000000 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ /dev/null @@ -1,679 +0,0 @@ -//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass compute turns all control flow pseudo instructions into native one -/// computing their address on the fly ; it also sets STACK_SIZE info. 
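Before the implementation, the overall shape of this pass is worth sketching: structured pseudo instructions (IF_PREDICATE_SET, ELSE, ENDIF, WHILELOOP, ENDLOOP, BREAK, CONTINUE) are rewritten into native CF instructions whose jump targets are only known after later instructions have been counted, so unresolved branches are kept on a stack and back-patched via CounterPropagateAddr once the matching join point is reached. A toy model of that back-patching, using invented stand-in types rather than the real MachineInstr plumbing:

.. code-block:: cpp

  // Toy model of forward-branch back-patching in R600ControlFlowFinalizer.
  #include <cstdio>
  #include <vector>

  struct CFInst { const char *Op; unsigned Addr; };

  int main() {
    std::vector<CFInst> Prog;
    std::vector<unsigned> IfStack;       // indices of unresolved CF_JUMPs
    unsigned CfCount = 0;

    // IF_PREDICATE_SET becomes a CF_JUMP with a placeholder target.
    Prog.push_back({"CF_JUMP", 0});
    IfStack.push_back(Prog.size() - 1);
    ++CfCount;

    // The predicated ALU clause body.
    Prog.push_back({"CF_ALU", 0});
    ++CfCount;

    // ENDIF becomes a POP; the pending CF_JUMP is then patched to the
    // address just past the POP (the join point).
    Prog.push_back({"POP", CfCount + 1});
    ++CfCount;
    Prog[IfStack.back()].Addr = CfCount;
    IfStack.pop_back();

    for (const CFInst &I : Prog)
      printf("%-8s addr=%u\n", I.Op, I.Addr);  // CF_JUMP and POP both target 3
  }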
-//===----------------------------------------------------------------------===// - -#include "llvm/Support/Debug.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "r600cf" - -namespace { - -struct CFStack { - - enum StackItem { - ENTRY = 0, - SUB_ENTRY = 1, - FIRST_NON_WQM_PUSH = 2, - FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 - }; - - const AMDGPUSubtarget *ST; - std::vector BranchStack; - std::vector LoopStack; - unsigned MaxStackSize; - unsigned CurrentEntries; - unsigned CurrentSubEntries; - - CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), - // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), - CurrentEntries(0), CurrentSubEntries(0) { } - - unsigned getLoopDepth(); - bool branchStackContains(CFStack::StackItem); - bool requiresWorkAroundForInst(unsigned Opcode); - unsigned getSubEntrySize(CFStack::StackItem Item); - void updateMaxStackSize(); - void pushBranch(unsigned Opcode, bool isWQM = false); - void pushLoop(); - void popBranch(); - void popLoop(); -}; - -unsigned CFStack::getLoopDepth() { - return LoopStack.size(); -} - -bool CFStack::branchStackContains(CFStack::StackItem Item) { - for (std::vector::const_iterator I = BranchStack.begin(), - E = BranchStack.end(); I != E; ++I) { - if (*I == Item) - return true; - } - return false; -} - -bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && - getLoopDepth() > 1) - return true; - - if (!ST->hasCFAluBug()) - return false; - - switch(Opcode) { - default: return false; - case AMDGPU::CF_ALU_PUSH_BEFORE: - case AMDGPU::CF_ALU_ELSE_AFTER: - case AMDGPU::CF_ALU_BREAK: - case AMDGPU::CF_ALU_CONTINUE: - if (CurrentSubEntries == 0) - return false; - if (ST->getWavefrontSize() == 64) { - // We are being conservative here. We only require this work-around if - // CurrentSubEntries > 3 && - // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) - // - // We have to be conservative, because we don't know for certain that - // our stack allocation algorithm for Evergreen/NI is correct. Applying this - // work-around when CurrentSubEntries > 3 allows us to over-allocate stack - // resources without any problems. - return CurrentSubEntries > 3; - } else { - assert(ST->getWavefrontSize() == 32); - // We are being conservative here. We only require the work-around if - // CurrentSubEntries > 7 && - // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) - // See the comment on the wavefront size == 64 case for why we are - // being conservative. - return CurrentSubEntries > 7; - } - } -} - -unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { - switch(Item) { - default: - return 0; - case CFStack::FIRST_NON_WQM_PUSH: - assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= AMDGPUSubtarget::R700) { - // +1 For the push operation. - // +2 Extra space required. - return 3; - } else { - // Some documentation says that this is not necessary on Evergreen, - // but experimentation has show that we need to allocate 1 extra - // sub-entry for the first non-WQM push. - // +1 For the push operation. - // +1 Extra space required. 
- return 2; - } - case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); - // +1 For the push operation. - // +1 Extra space required. - return 2; - case CFStack::SUB_ENTRY: - return 1; - } -} - -void CFStack::updateMaxStackSize() { - unsigned CurrentStackSize = CurrentEntries + - (RoundUpToAlignment(CurrentSubEntries, 4) / 4); - MaxStackSize = std::max(CurrentStackSize, MaxStackSize); -} - -void CFStack::pushBranch(unsigned Opcode, bool isWQM) { - CFStack::StackItem Item = CFStack::ENTRY; - switch(Opcode) { - case AMDGPU::CF_PUSH_EG: - case AMDGPU::CF_ALU_PUSH_BEFORE: - if (!isWQM) { - if (!ST->hasCaymanISA() && - !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) - Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI - // See comment in - // CFStack::getSubEntrySize() - else if (CurrentEntries > 0 && - ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && - !ST->hasCaymanISA() && - !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) - Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; - else - Item = CFStack::SUB_ENTRY; - } else - Item = CFStack::ENTRY; - break; - } - BranchStack.push_back(Item); - if (Item == CFStack::ENTRY) - CurrentEntries++; - else - CurrentSubEntries += getSubEntrySize(Item); - updateMaxStackSize(); -} - -void CFStack::pushLoop() { - LoopStack.push_back(CFStack::ENTRY); - CurrentEntries++; - updateMaxStackSize(); -} - -void CFStack::popBranch() { - CFStack::StackItem Top = BranchStack.back(); - if (Top == CFStack::ENTRY) - CurrentEntries--; - else - CurrentSubEntries-= getSubEntrySize(Top); - BranchStack.pop_back(); -} - -void CFStack::popLoop() { - CurrentEntries--; - LoopStack.pop_back(); -} - -class R600ControlFlowFinalizer : public MachineFunctionPass { - -private: - typedef std::pair > ClauseFile; - - enum ControlFlowInstruction { - CF_TC, - CF_VC, - CF_CALL_FS, - CF_WHILE_LOOP, - CF_END_LOOP, - CF_LOOP_BREAK, - CF_LOOP_CONTINUE, - CF_JUMP, - CF_ELSE, - CF_POP, - CF_END - }; - - static char ID; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - unsigned MaxFetchInst; - const AMDGPUSubtarget *ST; - - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - return true; - default: - return false; - } - } - - const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { - unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); - switch (CFI) { - case CF_TC: - Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; - break; - case CF_VC: - Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; - break; - case CF_CALL_FS: - Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; - break; - case CF_WHILE_LOOP: - Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; - break; - case CF_END_LOOP: - Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; - break; - case CF_LOOP_BREAK: - Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; - break; - case CF_LOOP_CONTINUE: - Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; - break; - case CF_JUMP: - Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; - break; - case CF_ELSE: - Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; - break; - case CF_POP: - Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; - break; - case CF_END: - if (ST->hasCaymanISA()) { - Opcode = AMDGPU::CF_END_CM; - break; - } - Opcode = isEg ? 
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; - break; - } - assert (Opcode && "No opcode selected"); - return TII->get(Opcode); - } - - bool isCompatibleWithClause(const MachineInstr *MI, - std::set &DstRegs) const { - unsigned DstMI, SrcMI; - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - const MachineOperand &MO = *I; - if (!MO.isReg()) - continue; - if (MO.isDef()) { - unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) - DstMI = Reg; - else - DstMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); - } - if (MO.isUse()) { - unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) - SrcMI = Reg; - else - SrcMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); - } - } - if ((DstRegs.find(SrcMI) == DstRegs.end())) { - DstRegs.insert(DstMI); - return true; - } else - return false; - } - - ClauseFile - MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) - const { - MachineBasicBlock::iterator ClauseHead = I; - std::vector ClauseContent; - unsigned AluInstCount = 0; - bool IsTex = TII->usesTextureCache(ClauseHead); - std::set DstRegs; - for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) - continue; - if (AluInstCount >= MaxFetchInst) - break; - if ((IsTex && !TII->usesTextureCache(I)) || - (!IsTex && !TII->usesVertexCache(I))) - break; - if (!isCompatibleWithClause(I, DstRegs)) - break; - AluInstCount ++; - ClauseContent.push_back(I); - } - MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), - getHWInstrDesc(IsTex?CF_TC:CF_VC)) - .addImm(0) // ADDR - .addImm(AluInstCount - 1); // COUNT - return ClauseFile(MIb, std::move(ClauseContent)); - } - - void getLiteral(MachineInstr *MI, std::vector &Lits) const { - static const unsigned LiteralRegs[] = { - AMDGPU::ALU_LITERAL_X, - AMDGPU::ALU_LITERAL_Y, - AMDGPU::ALU_LITERAL_Z, - AMDGPU::ALU_LITERAL_W - }; - const SmallVector, 3 > Srcs = - TII->getSrcs(MI); - for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { - if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) - continue; - int64_t Imm = Srcs[i].second; - std::vector::iterator It = - std::find(Lits.begin(), Lits.end(), Imm); - if (It != Lits.end()) { - unsigned Index = It - Lits.begin(); - Srcs[i].first->setReg(LiteralRegs[Index]); - } else { - assert(Lits.size() < 4 && "Too many literals in Instruction Group"); - Srcs[i].first->setReg(LiteralRegs[Lits.size()]); - Lits.push_back(Imm); - } - } - } - - MachineBasicBlock::iterator insertLiterals( - MachineBasicBlock::iterator InsertPos, - const std::vector &Literals) const { - MachineBasicBlock *MBB = InsertPos->getParent(); - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned LiteralPair0 = Literals[i]; - unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; - InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(LiteralPair0) - .addImm(LiteralPair1); - } - return InsertPos; - } - - ClauseFile - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) - const { - MachineBasicBlock::iterator ClauseHead = I; - std::vector ClauseContent; - I++; - for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { - if (IsTrivialInst(I)) { - ++I; - continue; - } - if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) - break; - std::vector Literals; - if (I->isBundle()) 
{ - MachineInstr *DeleteMI = I; - MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); - while (++BI != E && BI->isBundledWithPred()) { - BI->unbundleFromPred(); - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BI->getOperand(i); - if (MO.isReg() && MO.isInternalRead()) - MO.setIsInternalRead(false); - } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); - } - I = BI; - DeleteMI->eraseFromParent(); - } else { - getLiteral(I, Literals); - ClauseContent.push_back(I); - I++; - } - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned literal0 = Literals[i]; - unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; - MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(literal0) - .addImm(literal2); - ClauseContent.push_back(MILit); - } - } - assert(ClauseContent.size() < 128 && "ALU clause is too big"); - ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, std::move(ClauseContent)); - } - - void - EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { - CounterPropagateAddr(Clause.first, CfCount); - MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) - .addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } - CfCount += 2 * Clause.second.size(); - } - - void - EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { - Clause.first->getOperand(0).setImm(0); - CounterPropagateAddr(Clause.first, CfCount); - MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) - .addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } - CfCount += Clause.second.size(); - } - - void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { - MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); - } - void CounterPropagateAddr(const std::set &MIs, - unsigned Addr) const { - for (MachineInstr *MI : MIs) { - CounterPropagateAddr(MI, Addr); - } - } - -public: - R600ControlFlowFinalizer(TargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ST = &MF.getSubtarget(); - MaxFetchInst = ST->getTexVTXClauseSize(); - TII = static_cast(ST->getInstrInfo()); - TRI = static_cast(ST->getRegisterInfo()); - R600MachineFunctionInfo *MFI = MF.getInfo(); - - CFStack CFStack(ST, MFI->getShaderType()); - for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; - ++MB) { - MachineBasicBlock &MBB = *MB; - unsigned CfCount = 0; - std::vector > > LoopStack; - std::vector IfThenElseStack; - if (MFI->getShaderType() == ShaderType::VERTEX) { - BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), - getHWInstrDesc(CF_CALL_FS)); - CfCount++; - } - std::vector FetchClauses, AluClauses; - std::vector LastAlu(1); - std::vector ToPopAfter; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E;) { - if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { - DEBUG(dbgs() << CfCount << ":"; I->dump();); - FetchClauses.push_back(MakeFetchClause(MBB, I)); - CfCount++; - LastAlu.back() = nullptr; - continue; - } - - MachineBasicBlock::iterator MI = I; - if (MI->getOpcode() != AMDGPU::ENDIF) - 
LastAlu.back() = nullptr; - if (MI->getOpcode() == AMDGPU::CF_ALU) - LastAlu.back() = MI; - I++; - bool RequiresWorkAround = - CFStack.requiresWorkAroundForInst(MI->getOpcode()); - switch (MI->getOpcode()) { - case AMDGPU::CF_ALU_PUSH_BEFORE: - if (RequiresWorkAround) { - DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) - .addImm(CfCount + 1) - .addImm(1); - MI->setDesc(TII->get(AMDGPU::CF_ALU)); - CfCount++; - CFStack.pushBranch(AMDGPU::CF_PUSH_EG); - } else - CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); - - case AMDGPU::CF_ALU: - I = MI; - AluClauses.push_back(MakeALUClause(MBB, I)); - DEBUG(dbgs() << CfCount << ":"; MI->dump();); - CfCount++; - break; - case AMDGPU::WHILELOOP: { - CFStack.pushLoop(); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_WHILE_LOOP)) - .addImm(1); - std::pair > Pair(CfCount, - std::set()); - Pair.second.insert(MIb); - LoopStack.push_back(std::move(Pair)); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ENDLOOP: { - CFStack.popLoop(); - std::pair > Pair = - std::move(LoopStack.back()); - LoopStack.pop_back(); - CounterPropagateAddr(Pair.second, CfCount); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) - .addImm(Pair.first + 1); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::IF_PREDICATE_SET: { - LastAlu.push_back(nullptr); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_JUMP)) - .addImm(0) - .addImm(0); - IfThenElseStack.push_back(MIb); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ELSE: { - MachineInstr * JumpInst = IfThenElseStack.back(); - IfThenElseStack.pop_back(); - CounterPropagateAddr(JumpInst, CfCount); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_ELSE)) - .addImm(0) - .addImm(0); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - IfThenElseStack.push_back(MIb); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ENDIF: { - CFStack.popBranch(); - if (LastAlu.back()) { - ToPopAfter.push_back(LastAlu.back()); - } else { - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_POP)) - .addImm(CfCount + 1) - .addImm(1); - (void)MIb; - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - CfCount++; - } - - MachineInstr *IfOrElseInst = IfThenElseStack.back(); - IfThenElseStack.pop_back(); - CounterPropagateAddr(IfOrElseInst, CfCount); - IfOrElseInst->getOperand(1).setImm(1); - LastAlu.pop_back(); - MI->eraseFromParent(); - break; - } - case AMDGPU::BREAK: { - CfCount ++; - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_LOOP_BREAK)) - .addImm(0); - LoopStack.back().second.insert(MIb); - MI->eraseFromParent(); - break; - } - case AMDGPU::CONTINUE: { - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_LOOP_CONTINUE)) - .addImm(0); - LoopStack.back().second.insert(MIb); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::RETURN: { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); - CfCount++; - MI->eraseFromParent(); - if (CfCount % 2) { - BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); - CfCount++; - } - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, AluClauses[i], CfCount); 
- } - default: - if (TII->isExport(MI->getOpcode())) { - DEBUG(dbgs() << CfCount << ":"; MI->dump();); - CfCount++; - } - break; - } - } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { - MachineInstr *Alu = ToPopAfter[i]; - BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), - TII->get(AMDGPU::CF_ALU_POP_AFTER)) - .addImm(Alu->getOperand(0).getImm()) - .addImm(Alu->getOperand(1).getImm()) - .addImm(Alu->getOperand(2).getImm()) - .addImm(Alu->getOperand(3).getImm()) - .addImm(Alu->getOperand(4).getImm()) - .addImm(Alu->getOperand(5).getImm()) - .addImm(Alu->getOperand(6).getImm()) - .addImm(Alu->getOperand(7).getImm()) - .addImm(Alu->getOperand(8).getImm()); - Alu->eraseFromParent(); - } - MFI->StackSize = CFStack.MaxStackSize; - } - - return false; - } - - const char *getPassName() const override { - return "R600 Control Flow Finalizer Pass"; - } -}; - -char R600ControlFlowFinalizer::ID = 0; - -} // end anonymous namespace - - -llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { - return new R600ControlFlowFinalizer(TM); -} diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h deleted file mode 100644 index 51d87eda31d..00000000000 --- a/lib/Target/R600/R600Defines.h +++ /dev/null @@ -1,171 +0,0 @@ -//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H -#define LLVM_LIB_TARGET_R600_R600DEFINES_H - -#include "llvm/MC/MCRegisterInfo.h" - -// Operand Flags -#define MO_FLAG_CLAMP (1 << 0) -#define MO_FLAG_NEG (1 << 1) -#define MO_FLAG_ABS (1 << 2) -#define MO_FLAG_MASK (1 << 3) -#define MO_FLAG_PUSH (1 << 4) -#define MO_FLAG_NOT_LAST (1 << 5) -#define MO_FLAG_LAST (1 << 6) -#define NUM_MO_FLAGS 7 - -/// \brief Helper for getting the operand index for the instruction flags -/// operand. 
-#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) - -namespace R600_InstFlag { - enum TIF { - TRANS_ONLY = (1 << 0), - TEX = (1 << 1), - REDUCTION = (1 << 2), - FC = (1 << 3), - TRIG = (1 << 4), - OP3 = (1 << 5), - VECTOR = (1 << 6), - //FlagOperand bits 7, 8 - NATIVE_OPERANDS = (1 << 9), - OP1 = (1 << 10), - OP2 = (1 << 11), - VTX_INST = (1 << 12), - TEX_INST = (1 << 13), - ALU_INST = (1 << 14), - LDS_1A = (1 << 15), - LDS_1A1D = (1 << 16), - IS_EXPORT = (1 << 17), - LDS_1A2D = (1 << 18) - }; -} - -#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) - -/// \brief Defines for extracting register information from register encoding -#define HW_REG_MASK 0x1ff -#define HW_CHAN_SHIFT 9 - -#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) -#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) - -#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) -#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) - -namespace OpName { - - enum VecOps { - UPDATE_EXEC_MASK_X, - UPDATE_PREDICATE_X, - WRITE_X, - OMOD_X, - DST_REL_X, - CLAMP_X, - SRC0_X, - SRC0_NEG_X, - SRC0_REL_X, - SRC0_ABS_X, - SRC0_SEL_X, - SRC1_X, - SRC1_NEG_X, - SRC1_REL_X, - SRC1_ABS_X, - SRC1_SEL_X, - PRED_SEL_X, - UPDATE_EXEC_MASK_Y, - UPDATE_PREDICATE_Y, - WRITE_Y, - OMOD_Y, - DST_REL_Y, - CLAMP_Y, - SRC0_Y, - SRC0_NEG_Y, - SRC0_REL_Y, - SRC0_ABS_Y, - SRC0_SEL_Y, - SRC1_Y, - SRC1_NEG_Y, - SRC1_REL_Y, - SRC1_ABS_Y, - SRC1_SEL_Y, - PRED_SEL_Y, - UPDATE_EXEC_MASK_Z, - UPDATE_PREDICATE_Z, - WRITE_Z, - OMOD_Z, - DST_REL_Z, - CLAMP_Z, - SRC0_Z, - SRC0_NEG_Z, - SRC0_REL_Z, - SRC0_ABS_Z, - SRC0_SEL_Z, - SRC1_Z, - SRC1_NEG_Z, - SRC1_REL_Z, - SRC1_ABS_Z, - SRC1_SEL_Z, - PRED_SEL_Z, - UPDATE_EXEC_MASK_W, - UPDATE_PREDICATE_W, - WRITE_W, - OMOD_W, - DST_REL_W, - CLAMP_W, - SRC0_W, - SRC0_NEG_W, - SRC0_REL_W, - SRC0_ABS_W, - SRC0_SEL_W, - SRC1_W, - SRC1_NEG_W, - SRC1_REL_W, - SRC1_ABS_W, - SRC1_SEL_W, - PRED_SEL_W, - IMM_0, - IMM_1, - VEC_COUNT - }; - -} - -//===----------------------------------------------------------------------===// -// Config register definitions -//===----------------------------------------------------------------------===// - -#define R_02880C_DB_SHADER_CONTROL 0x02880C -#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) - -// These fields are the same for all shader types and families. 
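The register-encoding macros just above are the counterpart of getHWReg and getHWRegChan in the R600 code emitter: a register's hardware encoding keeps the channel above bit 9 and the register index in the low nine bits. A short sanity check of that split (the encoding value below is made up purely for illustration):

.. code-block:: cpp

  // Demonstrates GET_REG_INDEX / GET_REG_CHAN on a hand-built encoding.
  #include <cstdio>

  #define HW_REG_MASK   0x1ff
  #define HW_CHAN_SHIFT 9
  #define GET_REG_CHAN(reg)  ((reg) >> HW_CHAN_SHIFT)
  #define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)

  int main() {
    unsigned Enc = (2u << HW_CHAN_SHIFT) | 5u;  // channel 2, index 5
    printf("index=%u chan=%u\n", GET_REG_INDEX(Enc), GET_REG_CHAN(Enc)); // index=5 chan=2
  }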
-#define S_NUM_GPRS(x) (((x) & 0xFF) << 0) -#define S_STACK_SIZE(x) (((x) & 0xFF) << 8) -//===----------------------------------------------------------------------===// -// R600, R700 Registers -//===----------------------------------------------------------------------===// - -#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 -#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 - -//===----------------------------------------------------------------------===// -// Evergreen, Northern Islands Registers -//===----------------------------------------------------------------------===// - -#define R_028844_SQ_PGM_RESOURCES_PS 0x028844 -#define R_028860_SQ_PGM_RESOURCES_VS 0x028860 -#define R_028878_SQ_PGM_RESOURCES_GS 0x028878 -#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4 - -#define R_0288E8_SQ_LDS_ALLOC 0x0288E8 - -#endif diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp deleted file mode 100644 index fdc20302f4a..00000000000 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ /dev/null @@ -1,336 +0,0 @@ -//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold -/// 128 Alu instructions ; these instructions can access up to 4 prefetched -/// 4 lines of 16 registers from constant buffers. Such ALU clauses are -/// initiated by CF_ALU instructions. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace llvm { - void initializeR600EmitClauseMarkersPass(PassRegistry&); -} - -namespace { - -class R600EmitClauseMarkers : public MachineFunctionPass { - -private: - const R600InstrInfo *TII; - int Address; - - unsigned OccupiedDwords(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return 4; - case AMDGPU::KILL: - return 0; - default: - break; - } - - // These will be expanded to two ALU instructions in the - // ExpandSpecialInstructions pass. 
- if (TII->isLDSRetInstr(MI->getOpcode())) - return 2; - - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode())) - return 4; - - unsigned NumLiteral = 0; - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) - ++NumLiteral; - } - return 1 + NumLiteral; - } - - bool isALU(const MachineInstr *MI) const { - if (TII->isALUInstr(MI->getOpcode())) - return true; - if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) - return true; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: - return true; - default: - return false; - } - } - - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - case AMDGPU::IMPLICIT_DEF: - return true; - default: - return false; - } - } - - std::pair getAccessedBankLine(unsigned Sel) const { - // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 - // (See also R600ISelLowering.cpp) - // ConstIndex value is in [0, 4095]; - return std::pair( - ((Sel >> 2) - 512) >> 12, // KC_BANK - // Line Number of ConstIndex - // A line contains 16 constant registers however KCX bank can lock - // two line at the same time ; thus we want to get an even line number. - // Line number can be retrieved with (>>4), using (>>5) <<1 generates - // an even number. - ((((Sel >> 2) - 512) & 4095) >> 5) << 1); - } - - bool SubstituteKCacheBank(MachineInstr *MI, - std::vector > &CachedConsts, - bool UpdateInstr = true) const { - std::vector > UsedKCache; - - if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) - return true; - - const SmallVectorImpl > &Consts = - TII->getSrcs(MI); - assert((TII->isALUInstr(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) - continue; - unsigned Sel = Consts[i].second; - unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; - unsigned KCacheIndex = Index * 4 + Chan; - const std::pair &BankLine = getAccessedBankLine(Sel); - if (CachedConsts.empty()) { - CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(0, KCacheIndex)); - continue; - } - if (CachedConsts[0] == BankLine) { - UsedKCache.push_back(std::pair(0, KCacheIndex)); - continue; - } - if (CachedConsts.size() == 1) { - CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(1, KCacheIndex)); - continue; - } - if (CachedConsts[1] == BankLine) { - UsedKCache.push_back(std::pair(1, KCacheIndex)); - continue; - } - return false; - } - - if (!UpdateInstr) - return true; - - for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) - continue; - switch(UsedKCache[j].first) { - case 0: - Consts[i].first->setReg( - AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); - break; - case 1: - Consts[i].first->setReg( - AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); - break; - default: - llvm_unreachable("Wrong Cache Line"); - } - j++; - } - return true; - } - - bool canClauseLocalKillFitInClause( - unsigned AluInstCount, - std::vector > KCacheBanks, - MachineBasicBlock::iterator Def, - MachineBasicBlock::iterator BBEnd) { - const R600RegisterInfo &TRI = 
TII->getRegisterInfo(); - for (MachineInstr::const_mop_iterator - MOI = Def->operands_begin(), - MOE = Def->operands_end(); MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef() || - TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) - continue; - - // Def defines a clause local register, so check that its use will fit - // in the clause. - unsigned LastUseCount = 0; - for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { - AluInstCount += OccupiedDwords(UseI); - // Make sure we won't need to end the clause due to KCache limitations. - if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) - return false; - - // We have reached the maximum instruction limit before finding the - // use that kills this register, so we cannot use this def in the - // current clause. - if (AluInstCount >= TII->getMaxAlusPerClause()) - return false; - - // Register kill flags have been cleared by the time we get to this - // pass, but it is safe to assume that all uses of this register - // occur in the same basic block as its definition, because - // it is illegal for the scheduler to schedule them in - // different blocks. - if (UseI->findRegisterUseOperandIdx(MOI->getReg())) - LastUseCount = AluInstCount; - - if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) - break; - } - if (LastUseCount) - return LastUseCount <= TII->getMaxAlusPerClause(); - llvm_unreachable("Clause local register live at end of clause."); - } - return true; - } - - MachineBasicBlock::iterator - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineBasicBlock::iterator ClauseHead = I; - std::vector > KCacheBanks; - bool PushBeforeModifier = false; - unsigned AluInstCount = 0; - for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) - continue; - if (!isALU(I)) - break; - if (AluInstCount > TII->getMaxAlusPerClause()) - break; - if (I->getOpcode() == AMDGPU::PRED_X) { - // We put PRED_X in its own clause to ensure that ifcvt won't create - // clauses with more than 128 insts. - // IfCvt is indeed checking that "then" and "else" branches of an if - // statement have less than ~60 insts thus converted clauses can't be - // bigger than ~121 insts (predicate setter needs to be in the same - // clause as predicated alus). - if (AluInstCount > 0) - break; - if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) - PushBeforeModifier = true; - AluInstCount ++; - continue; - } - // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: - // - // * KILL or INTERP instructions - // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits - // * Uses waterfalling (i.e. INDEX_MODE = AR.X) - // - // XXX: These checks have not been implemented yet. - if (TII->mustBeLastInClause(I->getOpcode())) { - I++; - break; - } - - // If this instruction defines a clause local register, make sure - // its use can fit in this clause. - if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) - break; - - if (!SubstituteKCacheBank(I, KCacheBanks)) - break; - AluInstCount += OccupiedDwords(I); - } - unsigned Opcode = PushBeforeModifier ? - AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; - BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) - // We don't use the ADDR field until R600ControlFlowFinalizer pass, where - // it is safe to assume it is 0. However if we always put 0 here, the ifcvt - // pass may assume that identical ALU clause starter at the beginning of a - // true and false branch can be factorized which is not the case. 
- .addImm(Address++) // ADDR - .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 - .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 - .addImm(KCacheBanks.empty()?0:2) // KM0 - .addImm((KCacheBanks.size() < 2)?0:2) // KM1 - .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 - .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 - .addImm(AluInstCount) // COUNT - .addImm(1); // Enabled - return I; - } - -public: - static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { - - initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(); - if (I->getOpcode() == AMDGPU::CF_ALU) - continue; // BB was already parsed - for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { - if (isALU(I)) - I = MakeALUClause(MBB, I); - else - ++I; - } - } - return false; - } - - const char *getPassName() const override { - return "R600 Emit Clause Markers Pass"; - } -}; - -char R600EmitClauseMarkers::ID = 0; - -} // end anonymous namespace - -INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) -INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) - -llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { - return new R600EmitClauseMarkers(); -} - diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp deleted file mode 100644 index 211d392e8fc..00000000000 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ /dev/null @@ -1,349 +0,0 @@ -//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Vector, Reduction, and Cube instructions need to fill the entire instruction -/// group to work correctly. This pass expands these individual instructions -/// into several instructions that will completely fill the instruction group. 
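One detail of this pass that benefits from a worked example is the CUBE expansion: the per-channel source components are chosen through a small swizzle table (CubeSrcSwz = {2, 2, 0, 1}), and walking that table reproduces the four expanded slots listed in the comments further down. A quick standalone check:

.. code-block:: cpp

  // Reproduces the CUBE per-channel source swizzle used by this pass.
  #include <cstdio>

  int main() {
    static const int CubeSrcSwz[] = {2, 2, 0, 1};   // Z, Z, X, Y
    const char Comp[] = {'X', 'Y', 'Z', 'W'};
    for (int Chan = 0; Chan < 4; ++Chan)
      printf("T0_%c = CUBE T1_%c, T1_%c\n", Comp[Chan],
             Comp[CubeSrcSwz[Chan]], Comp[CubeSrcSwz[3 - Chan]]);
    // Prints: T0_X = CUBE T1_Z, T1_Y / T0_Y = CUBE T1_Z, T1_X
    //         T0_Z = CUBE T1_X, T1_Z / T0_W = CUBE T1_Y, T1_Z
  }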
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -class R600ExpandSpecialInstrsPass : public MachineFunctionPass { - -private: - static char ID; - const R600InstrInfo *TII; - - void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, - unsigned Op); - -public: - R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(nullptr) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "R600 Expand special instructions pass"; - } -}; - -} // End anonymous namespace - -char R600ExpandSpecialInstrsPass::ID = 0; - -FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { - return new R600ExpandSpecialInstrsPass(TM); -} - -void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, - const MachineInstr *OldMI, unsigned Op) { - int OpIdx = TII->getOperandIdx(*OldMI, Op); - if (OpIdx > -1) { - uint64_t Val = OldMI->getOperand(OpIdx).getImm(); - TII->setImmOperand(NewMI, Op, Val); - } -} - -bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(); - while (I != MBB.end()) { - MachineInstr &MI = *I; - I = std::next(I); - - // Expand LDS_*_RET instructions - if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - MachineOperand &DstOp = MI.getOperand(DstIdx); - MachineInstr *Mov = TII->buildMovInstr(&MBB, I, - DstOp.getReg(), AMDGPU::OQAP); - DstOp.setReg(AMDGPU::OQAP); - int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), - AMDGPU::OpName::pred_sel); - int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), - AMDGPU::OpName::pred_sel); - // Copy the pred_sel bit - Mov->getOperand(MovPredSelIdx).setReg( - MI.getOperand(LDSPredSelIdx).getReg()); - } - - switch (MI.getOpcode()) { - default: break; - // Expand PRED_X to one of the PRED_SET instructions. - case AMDGPU::PRED_X: { - uint64_t Flags = MI.getOperand(3).getImm(); - // The native opcode used by PRED_X is stored as an immediate in the - // third operand. - MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, - MI.getOperand(2).getImm(), // opcode - MI.getOperand(0).getReg(), // dst - MI.getOperand(1).getReg(), // src0 - AMDGPU::ZERO); // src1 - TII->addFlag(PredSet, 0, MO_FLAG_MASK); - if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); - } else { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); - } - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_XY: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = MI.getOperand(Chan).getReg(); - else - DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan >= 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_ZW: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; - else - DstReg = MI.getOperand(Chan-2).getReg(); - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan < 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_VEC_LOAD: { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(1).getImm()); - unsigned DstReg = MI.getOperand(0).getReg(); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, - TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - case AMDGPU::DOT_4: { - - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - bool Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); - MachineInstr *BMI = - TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Mask) { - TII->addFlag(BMI, 0, MO_FLAG_MASK); - } - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - unsigned Opcode = BMI->getOpcode(); - // While not strictly necessary from hw point of view, we force - // all src operands of a dot4 inst to belong to the same slot. 
- unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) - .getReg(); - unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) - .getReg(); - (void) Src0; - (void) Src1; - if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && - (TRI.getEncodingValue(Src1) & 0xff) < 127) - assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); - } - MI.eraseFromParent(); - continue; - } - } - - bool IsReduction = TII->isReductionOp(MI.getOpcode()); - bool IsVector = TII->isVector(MI); - bool IsCube = TII->isCubeOp(MI.getOpcode()); - if (!IsReduction && !IsVector && !IsCube) { - continue; - } - - // Expand the instruction - // - // Reduction instructions: - // T0_X = DP4 T1_XYZW, T2_XYZW - // becomes: - // TO_X = DP4 T1_X, T2_X - // TO_Y (write masked) = DP4 T1_Y, T2_Y - // TO_Z (write masked) = DP4 T1_Z, T2_Z - // TO_W (write masked) = DP4 T1_W, T2_W - // - // Vector instructions: - // T0_X = MULLO_INT T1_X, T2_X - // becomes: - // T0_X = MULLO_INT T1_X, T2_X - // T0_Y (write masked) = MULLO_INT T1_X, T2_X - // T0_Z (write masked) = MULLO_INT T1_X, T2_X - // T0_W (write masked) = MULLO_INT T1_X, T2_X - // - // Cube instructions: - // T0_XYZW = CUBE T1_XYZW - // becomes: - // TO_X = CUBE T1_Z, T1_Y - // T0_Y = CUBE T1_Z, T1_X - // T0_Z = CUBE T1_X, T1_Z - // T0_W = CUBE T1_Y, T1_Z - for (unsigned Chan = 0; Chan < 4; Chan++) { - unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); - unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); - unsigned Src1 = 0; - - // Determine the correct source registers - if (!IsCube) { - int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); - if (Src1Idx != -1) { - Src1 = MI.getOperand(Src1Idx).getReg(); - } - } - if (IsReduction) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); - Src0 = TRI.getSubReg(Src0, SubRegIndex); - Src1 = TRI.getSubReg(Src1, SubRegIndex); - } else if (IsCube) { - static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); - Src1 = TRI.getSubReg(Src0, SubRegIndex1); - Src0 = TRI.getSubReg(Src0, SubRegIndex0); - } - - // Determine the correct destination registers; - bool Mask = false; - bool NotLast = true; - if (IsCube) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); - DstReg = TRI.getSubReg(DstReg, SubRegIndex); - } else { - // Mask the write if the original instruction does not write to - // the current Channel. 
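For reference, the per-channel destination here is picked out of the flat R600_TReg32 register class, whose entries are assumed in this code to be ordered four channels per vector register, so the index computed below is simply:

    SubDstReg = R600_TReg32[(DstBase * 4) + Chan]   // channel Chan of vector register DstBase
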
- Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); - } - - // Set the IsLast bit - NotLast = (Chan != 3 ); - - // Add the new instruction - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::CUBE_r600_pseudo: - Opcode = AMDGPU::CUBE_r600_real; - break; - case AMDGPU::CUBE_eg_pseudo: - Opcode = AMDGPU::CUBE_eg_real; - break; - default: - break; - } - - MachineInstr *NewMI = - TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); - - if (Chan != 0) - NewMI->bundleWithPred(); - if (Mask) { - TII->addFlag(NewMI, 0, MO_FLAG_MASK); - } - if (NotLast) { - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); - } - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); - } - MI.eraseFromParent(); - } - } - return false; -} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp deleted file mode 100644 index 8357b6d9d0e..00000000000 --- a/lib/Target/R600/R600ISelLowering.cpp +++ /dev/null @@ -1,2286 +0,0 @@ -//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Custom DAG lowering for R600 -// -//===----------------------------------------------------------------------===// - -#include "R600ISelLowering.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/Function.h" - -using namespace llvm; - -R600TargetLowering::R600TargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - - computeRegisterProperties(STI.getRegisterInfo()); - - // Set condition code actions - setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - setCondCodeAction(ISD::SETLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - 
setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - - setCondCodeAction(ISD::SETLE, MVT::i32, Expand); - setCondCodeAction(ISD::SETLT, MVT::i32, Expand); - setCondCodeAction(ISD::SETULE, MVT::i32, Expand); - setCondCodeAction(ISD::SETULT, MVT::i32, Expand); - - setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FSIN, MVT::f32, Custom); - - setOperationAction(ISD::SETCC, MVT::v4i32, Expand); - setOperationAction(ISD::SETCC, MVT::v2i32, Expand); - - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - setOperationAction(ISD::FSUB, MVT::f32, Expand); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - - setOperationAction(ISD::SETCC, MVT::i32, Expand); - setOperationAction(ISD::SETCC, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - - setOperationAction(ISD::SELECT, MVT::i32, Expand); - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - - // ADD, SUB overflow. - // TODO: turn these into Legal? - if (Subtarget->hasCARRY()) - setOperationAction(ISD::UADDO, MVT::i32, Custom); - - if (Subtarget->hasBORROW()) - setOperationAction(ISD::USUBO, MVT::i32, Custom); - - // Expand sign extension of vectors - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); - - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); - - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); - - - // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address - // spaces, so it is custom lowered to handle those where it isn't. 
- for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } - - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); - - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 - // to be Legal/Custom in order to avoid library calls. - setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - } - - setSchedulingPreference(Sched::Source); -} - -MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - MachineFunction * MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineBasicBlock::iterator I = *MI; - const R600InstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - switch (MI->getOpcode()) { - default: - // Replace LDS_*_RET instruction that don't have any uses with the - // equivalent LDS_*_NORET instruction. - if (TII->isLDSRetInstr(MI->getOpcode())) { - int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - MachineInstrBuilder NewMI; - // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add - // LDS_1A2D support and remove this special case. 
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || - MI->getOpcode() == AMDGPU::LDS_CMPST_RET) - return BB; - - NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI->getOperand(i)); - } - } else { - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - } - break; - case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); - break; - } - - case AMDGPU::FABS_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_ABS); - break; - } - - case AMDGPU::FNEG_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_NEG); - break; - } - - case AMDGPU::MASK_WRITE: { - unsigned maskedRegister = MI->getOperand(0).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); - MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); - TII->addFlag(defInstr, 0, MO_FLAG_MASK); - break; - } - - case AMDGPU::MOV_IMM_F32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getFPImm()->getValueAPF() - .bitcastToAPInt().getZExtValue()); - break; - case AMDGPU::MOV_IMM_I32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); - break; - case AMDGPU::CONST_COPY: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, - MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, - MI->getOperand(1).getImm()); - break; - } - - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit - break; - } - - case AMDGPU::TXD: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::TXD_SHADOW: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 
- .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)); - break; - - case AMDGPU::BRANCH_COND_f32: { - MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - break; - } - - case AMDGPU::BRANCH_COND_i32: { - MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO_INT) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - break; - } - - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: { - // Instruction is left unmodified if its not the last one of its type - bool isLastInstructionOfItsType = true; - unsigned InstExportType = MI->getOperand(1).getImm(); - for (MachineBasicBlock::iterator NextExportInst = std::next(I), - EndBlock = BB->end(); NextExportInst != EndBlock; - NextExportInst = std::next(NextExportInst)) { - if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || - NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { - unsigned CurrentInstExportType = NextExportInst->getOperand(1) - .getImm(); - if (CurrentInstExportType == InstExportType) { - isLastInstructionOfItsType = false; - break; - } - } - } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - if (!EOP && !isLastInstructionOfItsType) - return BB; - unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(4)) - .addOperand(MI->getOperand(5)) - .addOperand(MI->getOperand(6)) - .addImm(CfInst) - .addImm(EOP); - break; - } - case AMDGPU::RETURN: { - // RETURN instructions must have the live-out registers as implicit uses, - // otherwise they appear dead. 
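Without those implicit operands nothing at the MachineInstr level reads the registers that the earlier copies wrote, so later passes could treat their definitions as dead. The loop below therefore appends one implicit use per recorded live-out, roughly:

    MIB.addReg(LiveOutReg, RegState::Implicit);   // one implicit use per live-out register
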
- R600MachineFunctionInfo *MFI = MF->getInfo(); - MachineInstrBuilder MIB(*MF, MI); - for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) - MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); - return BB; - } - } - - MI->eraseFromParent(); - return BB; -} - -//===----------------------------------------------------------------------===// -// Custom DAG Lowering Operations -//===----------------------------------------------------------------------===// - -SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); - switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); - case ISD::SRA_PARTS: - case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); - case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); - case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); - case ISD::FCOS: - case ISD::FSIN: return LowerTrig(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::LOAD: { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } - - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); - case ISD::INTRINSIC_VOID: { - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = - cast(Op.getOperand(1))->getZExtValue(); - switch (IntrinsicID) { - case AMDGPUIntrinsic::AMDGPU_store_output: { - int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MFI->LiveOuts.push_back(Reg); - return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); - } - case AMDGPUIntrinsic::R600_store_swizzle: { - SDLoc DL(Op); - const SDValue Args[8] = { - Chain, - Op.getOperand(2), // Export Value - Op.getOperand(3), // ArrayBase - Op.getOperand(4), // Type - DAG.getConstant(0, DL, MVT::i32), // SWZ_X - DAG.getConstant(1, DL, MVT::i32), // SWZ_Y - DAG.getConstant(2, DL, MVT::i32), // SWZ_Z - DAG.getConstant(3, DL, MVT::i32) // SWZ_W - }; - return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); - } - - // default for switch(IntrinsicID) - default: break; - } - // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) - break; - } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - switch(IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); - } - - case AMDGPUIntrinsic::R600_interp_input: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - int ijb = cast(Op.getOperand(2))->getSExtValue(); - MachineSDNode *interp; - if (ijb < 0) { - const R600InstrInfo *TII = - 
static_cast(Subtarget->getInstrInfo()); - interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); - return DAG.getTargetExtractSubreg( - TII->getRegisterInfo().getSubRegFromChannel(slot % 4), - DL, MVT::f32, SDValue(interp, 0)); - } - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); - unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); - MRI.addLiveIn(RegisterI); - MRI.addLiveIn(RegisterJ); - SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); - SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); - - if (slot % 4 < 2) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - return SDValue(interp, slot % 2); - } - case AMDGPUIntrinsic::R600_interp_xy: - case AMDGPUIntrinsic::R600_interp_zw: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - MachineSDNode *interp; - SDValue RegisterINode = Op.getOperand(2); - SDValue RegisterJNode = Op.getOperand(3); - - if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, - SDValue(interp, 0), SDValue(interp, 1)); - } - case AMDGPUIntrinsic::R600_tex: - case AMDGPUIntrinsic::R600_texc: - case AMDGPUIntrinsic::R600_txl: - case AMDGPUIntrinsic::R600_txlc: - case AMDGPUIntrinsic::R600_txb: - case AMDGPUIntrinsic::R600_txbc: - case AMDGPUIntrinsic::R600_txf: - case AMDGPUIntrinsic::R600_txq: - case AMDGPUIntrinsic::R600_ddx: - case AMDGPUIntrinsic::R600_ddy: - case AMDGPUIntrinsic::R600_ldptr: { - unsigned TextureOp; - switch (IntrinsicID) { - case AMDGPUIntrinsic::R600_tex: - TextureOp = 0; - break; - case AMDGPUIntrinsic::R600_texc: - TextureOp = 1; - break; - case AMDGPUIntrinsic::R600_txl: - TextureOp = 2; - break; - case AMDGPUIntrinsic::R600_txlc: - TextureOp = 3; - break; - case AMDGPUIntrinsic::R600_txb: - TextureOp = 4; - break; - case AMDGPUIntrinsic::R600_txbc: - TextureOp = 5; - break; - case AMDGPUIntrinsic::R600_txf: - TextureOp = 6; - break; - case AMDGPUIntrinsic::R600_txq: - TextureOp = 7; - break; - case AMDGPUIntrinsic::R600_ddx: - TextureOp = 8; - break; - case AMDGPUIntrinsic::R600_ddy: - TextureOp = 9; - break; - case AMDGPUIntrinsic::R600_ldptr: - TextureOp = 10; - break; - default: - llvm_unreachable("Unknow Texture Operation"); - } - - SDValue TexArgs[19] = { - DAG.getConstant(TextureOp, DL, MVT::i32), - Op.getOperand(1), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), - DAG.getConstant(3, DL, MVT::i32), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), - DAG.getConstant(3, DL, MVT::i32), - 
Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10) - }; - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); - } - case AMDGPUIntrinsic::AMDGPU_dp4: { - SDValue Args[8] = { - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(0, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(0, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(1, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(1, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(2, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(2, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(3, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(3, DL, MVT::i32)) - }; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); - } - - case Intrinsic::r600_read_ngroups_x: - return LowerImplicitParameter(DAG, VT, DL, 0); - case Intrinsic::r600_read_ngroups_y: - return LowerImplicitParameter(DAG, VT, DL, 1); - case Intrinsic::r600_read_ngroups_z: - return LowerImplicitParameter(DAG, VT, DL, 2); - case Intrinsic::r600_read_global_size_x: - return LowerImplicitParameter(DAG, VT, DL, 3); - case Intrinsic::r600_read_global_size_y: - return LowerImplicitParameter(DAG, VT, DL, 4); - case Intrinsic::r600_read_global_size_z: - return LowerImplicitParameter(DAG, VT, DL, 5); - case Intrinsic::r600_read_local_size_x: - return LowerImplicitParameter(DAG, VT, DL, 6); - case Intrinsic::r600_read_local_size_y: - return LowerImplicitParameter(DAG, VT, DL, 7); - case Intrinsic::r600_read_local_size_z: - return LowerImplicitParameter(DAG, VT, DL, 8); - - case Intrinsic::AMDGPU_read_workdim: - return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); - - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
- return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - } - // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) - break; - } - } // end switch(Op.getOpcode()) - return SDValue(); -} - -void R600TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); - return; - case ISD::FP_TO_UINT: - if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); - return; - } - // Fall-through. Since we don't care about out of bounds values - // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint - // considers some extra cases which are not necessary here. - case ISD::FP_TO_SINT: { - SDValue Result; - if (expandFP_TO_SINT(N, Result, DAG)) - Results.push_back(Result); - return; - } - case ISD::SDIVREM: { - SDValue Op = SDValue(N, 1); - SDValue RES = LowerSDIVREM(Op, DAG); - Results.push_back(RES); - Results.push_back(RES.getValue(1)); - break; - } - case ISD::UDIVREM: { - SDValue Op = SDValue(N, 0); - LowerUDIVREM64(Op, DAG, Results); - break; - } - } -} - -SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, - SDValue Vector) const { - - SDLoc DL(Vector); - EVT VecVT = Vector.getValueType(); - EVT EltVT = VecVT.getVectorElementType(); - SmallVector Args; - - for (unsigned i = 0, e = VecVT.getVectorNumElements(); - i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, - DAG.getConstant(i, DL, getVectorIdxTy()))); - } - - return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); -} - -SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - - SDLoc DL(Op); - SDValue Vector = Op.getOperand(0); - SDValue Index = Op.getOperand(1); - - if (isa(Index) || - Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) - return Op; - - Vector = vectorToVerticalVector(DAG, Vector); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), - Vector, Index); -} - -SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Vector = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Index = Op.getOperand(2); - - if (isa(Index) || - Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) - return Op; - - Vector = vectorToVerticalVector(DAG, Vector); - SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), - Vector, Value, Index); - return vectorToVerticalVector(DAG, Insert); -} - -SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { - // On hw >= R700, COS/SIN input must be between -1. and 1. 
- // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) - EVT VT = Op.getValueType(); - SDValue Arg = Op.getOperand(0); - SDLoc DL(Op); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.15915494309, DL, MVT::f32)), - DAG.getConstantFP(0.5, DL, MVT::f32))); - unsigned TrigNode; - switch (Op.getOpcode()) { - case ISD::FCOS: - TrigNode = AMDGPUISD::COS_HW; - break; - case ISD::FSIN: - TrigNode = AMDGPUISD::SIN_HW; - break; - default: - llvm_unreachable("Wrong trig opcode"); - } - SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, - DAG.getNode(ISD::FADD, DL, VT, FractPart, - DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= AMDGPUSubtarget::R700) - return TrigVal; - // On R600 hw, COS/SIN input must be between -Pi and Pi. - return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, - DAG.getConstantFP(3.14159265359, DL, MVT::f32)); -} - -SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); - Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); - HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); - SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); - - SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); - SDValue LoBig = Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); - Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); - SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); - LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); - - SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); - SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, - unsigned mainop, unsigned ovf) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - - SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); - // Extend sign. - OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, - DAG.getValueType(MVT::i1)); - - SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); -} - -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - return DAG.getNode( - ISD::SETCC, - DL, - MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); -} - -SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, - unsigned DwordOffset) const { - unsigned ByteOffset = DwordOffset * 4; - PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); - - // We shouldn't be using an offset wider than 16-bits for implicit parameters. - assert(isInt<16>(ByteOffset)); - - return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR - MachinePointerInfo(ConstantPointerNull::get(PtrType)), - false, false, false, 0); -} - -bool R600TargetLowering::isZero(SDValue Op) const { - if(ConstantSDNode *Cst = dyn_cast(Op)) { - return Cst->isNullValue(); - } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ - return CstFP->isZero(); - } else { - return false; - } -} - -SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); - SDValue Temp; - - if (VT == MVT::f32) { - DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - if (MinMax) - return MinMax; - } - - // LHS and RHS are guaranteed to be the same value type - EVT CompareVT = LHS.getValueType(); - - // Check if we can lower this to a native operation. - - // Try to lower to a SET* instruction: - // - // SET* can match the following patterns: - // - // select_cc f32, f32, -1, 0, cc_supported - // select_cc f32, f32, 1.0f, 0.0f, cc_supported - // select_cc i32, i32, -1, 0, cc_supported - // - - // Move hardware True/False values to the correct operand. 
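As a sketch of the two canonicalizations performed below (the concrete SET*/CND* opcode is only chosen later, during instruction selection):

    select_cc x, y, 0, -1, seteq   -- invert cc, swap true/false -->   select_cc x, y, -1, 0, setne
    select_cc 0, y, t, f, setgt    -- swap operands              -->   select_cc y, 0, t, f, setlt
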
- ISD::CondCode CCOpcode = cast(CC)->get(); - ISD::CondCode InverseCC = - ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); - if (isHWTrueValue(False) && isHWFalseValue(True)) { - if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { - std::swap(False, True); - CC = DAG.getCondCode(InverseCC); - } else { - ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); - if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { - std::swap(False, True); - std::swap(LHS, RHS); - CC = DAG.getCondCode(SwapInvCC); - } - } - } - - if (isHWTrueValue(True) && isHWFalseValue(False) && - (CompareVT == VT || VT == MVT::i32)) { - // This can be matched by a SET* instruction. - return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); - } - - // Try to lower to a CND* instruction: - // - // CND* can match the following patterns: - // - // select_cc f32, 0.0, f32, f32, cc_supported - // select_cc f32, 0.0, i32, i32, cc_supported - // select_cc i32, 0, f32, f32, cc_supported - // select_cc i32, 0, i32, i32, cc_supported - // - - // Try to move the zero value to the RHS - if (isZero(LHS)) { - ISD::CondCode CCOpcode = cast(CC)->get(); - // Try swapping the operands - ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); - if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { - std::swap(LHS, RHS); - CC = DAG.getCondCode(CCSwapped); - } else { - // Try inverting the conditon and then swapping the operands - ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); - CCSwapped = ISD::getSetCCSwappedOperands(CCInv); - if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { - std::swap(True, False); - std::swap(LHS, RHS); - CC = DAG.getCondCode(CCSwapped); - } - } - } - if (isZero(RHS)) { - SDValue Cond = LHS; - SDValue Zero = RHS; - ISD::CondCode CCOpcode = cast(CC)->get(); - if (CompareVT != VT) { - // Bitcast True / False to the correct types. This will end up being - // a nop, but it allows us to define only a single pattern in the - // .TD files for each CND* instruction rather than having to have - // one pattern for integer True/False and one for fp True/False - True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); - False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); - } - - switch (CCOpcode) { - case ISD::SETONE: - case ISD::SETUNE: - case ISD::SETNE: - CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); - Temp = True; - True = False; - False = Temp; - break; - default: - break; - } - SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - Cond, Zero, - True, False, - DAG.getCondCode(CCOpcode)); - return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); - } - - // If we make it this for it means we have no native instructions to handle - // this SELECT_CC, so we must lower it. - SDValue HWTrue, HWFalse; - - if (CompareVT == MVT::f32) { - HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); - HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); - } else if (CompareVT == MVT::i32) { - HWTrue = DAG.getConstant(-1, DL, CompareVT); - HWFalse = DAG.getConstant(0, DL, CompareVT); - } - else { - llvm_unreachable("Unhandled value type in LowerSELECT_CC"); - } - - // Lower this unsupported SELECT_CC into a combination of two supported - // SELECT_CC operations. - SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); - - return DAG.getNode(ISD::SELECT_CC, DL, VT, - Cond, HWFalse, - True, False, - DAG.getCondCode(ISD::SETNE)); -} - -/// LLVM generates byte-addressed pointers. 
For indirect addressing, we need to -/// convert these pointers to a register index. Each register holds -/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the -/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used -/// for indirect addressing. -SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, - unsigned StackWidth, - SelectionDAG &DAG) const { - unsigned SRLPad; - switch(StackWidth) { - case 1: - SRLPad = 2; - break; - case 2: - SRLPad = 3; - break; - case 4: - SRLPad = 4; - break; - default: llvm_unreachable("Invalid stack width"); - } - - SDLoc DL(Ptr); - return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(SRLPad, DL, MVT::i32)); -} - -void R600TargetLowering::getStackAddress(unsigned StackWidth, - unsigned ElemIdx, - unsigned &Channel, - unsigned &PtrIncr) const { - switch (StackWidth) { - default: - case 1: - Channel = 0; - if (ElemIdx > 0) { - PtrIncr = 1; - } else { - PtrIncr = 0; - } - break; - case 2: - Channel = ElemIdx % 2; - if (ElemIdx == 2) { - PtrIncr = 1; - } else { - PtrIncr = 0; - } - break; - case 4: - Channel = ElemIdx; - PtrIncr = 0; - break; - } -} - -SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - StoreSDNode *StoreNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Ptr = Op.getOperand(2); - - SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Result.getNode()) { - return Result; - } - - if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { - if (StoreNode->isTruncatingStore()) { - EVT VT = Value.getValueType(); - assert(VT.bitsLE(MVT::i32)); - EVT MemVT = StoreNode->getMemoryVT(); - SDValue MaskConstant; - if (MemVT == MVT::i8) { - MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); - } else { - assert(MemVT == MVT::i16); - MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); - } - SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(0x00000003, DL, VT)); - SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, - DAG.getConstant(3, DL, VT)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); - SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); - // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 - // vector instead. - SDValue Src[4] = { - ShiftedValue, - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - Mask - }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); - SDValue Args[3] = { Chain, Input, DWordAddr }; - return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, - Op->getVTList(), Args, MemVT, - StoreNode->getMemOperand()); - } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - Value.getValueType().bitsGE(MVT::i32)) { - // Convert pointer from byte address to dword address. 
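A dword is four bytes, so the conversion below is just a logical shift right by two; for example:

    dword_index = byte_address >> 2   // byte offset 16 -> dword index 4
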
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, DL, MVT::i32))); - - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - llvm_unreachable("Truncated and indexed stores not supported yet"); - } else { - Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); - } - return Chain; - } - } - - EVT ValueVT = Value.getValueType(); - - if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } - - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) { - return Ret; - } - // Lowering for indirect addressing - - const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (ValueVT.isVector()) { - unsigned NumElemVT = ValueVT.getVectorNumElements(); - EVT ElemVT = ValueVT.getVectorElementType(); - SmallVector Stores(NumElemVT); - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, DL, MVT::i32)); - - Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32)); - } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); - } else { - if (ValueVT == MVT::i8) { - Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); - } - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); // Channel - } - - return Chain; -} - -// return (512 + (kc_bank << 12) -static int -ConstantAddressBlock(unsigned AddressSpace) { - switch (AddressSpace) { - case AMDGPUAS::CONSTANT_BUFFER_0: - return 512; - case AMDGPUAS::CONSTANT_BUFFER_1: - return 512 + 4096; - case AMDGPUAS::CONSTANT_BUFFER_2: - return 512 + 4096 * 2; - case AMDGPUAS::CONSTANT_BUFFER_3: - return 512 + 4096 * 3; - case AMDGPUAS::CONSTANT_BUFFER_4: - return 512 + 4096 * 4; - case AMDGPUAS::CONSTANT_BUFFER_5: - return 512 + 4096 * 5; - case AMDGPUAS::CONSTANT_BUFFER_6: - return 512 + 4096 * 6; - case AMDGPUAS::CONSTANT_BUFFER_7: - return 512 + 4096 * 7; - case AMDGPUAS::CONSTANT_BUFFER_8: - return 512 + 4096 * 8; - case AMDGPUAS::CONSTANT_BUFFER_9: - return 512 + 4096 * 9; - case AMDGPUAS::CONSTANT_BUFFER_10: - return 512 + 4096 * 10; - case AMDGPUAS::CONSTANT_BUFFER_11: - return 512 + 4096 * 11; - case AMDGPUAS::CONSTANT_BUFFER_12: - return 512 + 4096 * 12; - case AMDGPUAS::CONSTANT_BUFFER_13: - return 512 + 4096 * 13; - case AMDGPUAS::CONSTANT_BUFFER_14: - return 512 + 4096 * 14; - case AMDGPUAS::CONSTANT_BUFFER_15: - return 512 + 4096 * 15; - default: - return -1; - } -} - -SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const -{ - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - - SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); - if (Ret.getNode()) { - SDValue Ops[2] = { - Ret, - Chain - }; - return DAG.getMergeValues(Ops, DL); - } - - // Lower 
loads constant address space global variable loads - if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa(GetUnderlyingObject( - LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { - - SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - } - - if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { - SDValue MergedValues[2] = { - ScalarizeVectorLoad(Op, DAG), - Chain - }; - return DAG.getMergeValues(MergedValues, DL); - } - - int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); - if (ConstantBlock > -1 && - ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || - (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { - SDValue Result; - if (isa(LoadNode->getMemOperand()->getValue()) || - isa(LoadNode->getMemOperand()->getValue()) || - isa(Ptr)) { - SDValue Slots[4]; - for (unsigned i = 0; i < 4; i++) { - // We want Const position encoded with the following formula : - // (((512 + (kc_bank << 12) + const_index) << 2) + chan) - // const_index is Ptr computed by llvm using an alignment of 16. - // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and - // then div by 4 at the ISel step - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); - Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); - } - EVT NewVT = MVT::v4i32; - unsigned NumElements = 4; - if (VT.isVector()) { - NewVT = VT; - NumElements = VT.getVectorNumElements(); - } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, - makeArrayRef(Slots, NumElements)); - } else { - // non-constant ptr can't be folded, keeps it as a v4f32 load - Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(4, DL, MVT::i32)), - DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) - ); - } - - if (!VT.isVector()) { - Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, - DAG.getConstant(0, DL, MVT::i32)); - } - - SDValue MergedValues[2] = { - Result, - Chain - }; - return DAG.getMergeValues(MergedValues, DL); - } - - // For most operations returning SDValue() will result in the node being - // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we - // need to manually expand loads that may be legal in some address spaces and - // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for - // compute shaders, since the data is sign extended when it is uploaded to the - // buffer. However SEXT loads from other address spaces are not supported, so - // we need to expand them here. 
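The expansion performed below rewrites the sign-extending load as a plain extending load followed by an in-register sign extension, roughly:

    (i32 sextload i8, ptr)  ->  (i32 sign_extend_inreg (i32 extload i8, ptr), i8)
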
- if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { - EVT MemVT = LoadNode->getMemoryVT(); - assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, - LoadNode->getPointerInfo(), MemVT, - LoadNode->isVolatile(), - LoadNode->isNonTemporal(), - LoadNode->isInvariant(), - LoadNode->getAlignment()); - SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, - DAG.getValueType(MemVT)); - - SDValue MergedValues[2] = { Res, Chain }; - return DAG.getMergeValues(MergedValues, DL); - } - - if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } - - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (VT.isVector()) { - unsigned NumElemVT = VT.getVectorNumElements(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Loads[4]; - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, - Chain, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32), - Op.getOperand(2)); - } - for (unsigned i = NumElemVT; i < 4; ++i) { - Loads[i] = DAG.getUNDEF(ElemVT); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); - } else { - LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), // Channel - Op.getOperand(2)); - } - - SDValue Ops[2] = { - LoweredLoad, - Chain - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Jump = Op.getOperand(2); - - return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), - Chain, Jump, Cond); -} - -/// XXX Only kernel functions are supported, so we can assume for now that -/// every function is a kernel function, but in the future we should use -/// separate calling conventions for kernel and non-kernel functions. -SDValue R600TargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); - - SmallVector LocalIns; - - getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); - - AnalyzeFormalArguments(CCInfo, LocalIns); - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - CCValAssign &VA = ArgLocs[i]; - const ISD::InputArg &In = Ins[i]; - EVT VT = In.VT; - EVT MemVT = VA.getLocVT(); - if (!VT.isVector() && MemVT.isVector()) { - // Get load source type if scalarized. 
- MemVT = MemVT.getVectorElementType(); - } - - if (MFI->getShaderType() != ShaderType::COMPUTE) { - unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); - SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); - InVals.push_back(Register); - continue; - } - - PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); - - // i64 isn't a legal type, so the register type used ends up as i32, which - // isn't expected here. It attempts to create this sextload, but it ends up - // being invalid. Somehow this seems to work with i64 arguments, but breaks - // for <1 x i64>. - - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - ISD::LoadExtType Ext = ISD::NON_EXTLOAD; - if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { - // FIXME: This should really check the extload type, but the handling of - // extload vector parameters seems to be broken. - - // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - Ext = ISD::SEXTLOAD; - } - - // Compute the offset from the value. - // XXX - I think PartOffset should give you this, but it seems to give the - // size of the register which isn't useful. - - unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); - unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = 36 + VA.getLocMemOffset(); - - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); - SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), - DAG.getUNDEF(MVT::i32), - PtrInfo, - MemVT, false, true, true, 4); - - // 4 is the preferred alignment for the CONSTANT memory space. - InVals.push_back(Arg); - MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); - } - return Chain; -} - -EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) - return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} - -static SDValue CompactSwizzlableVector( - SelectionDAG &DAG, SDValue VectorEntry, - DenseMap &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); - assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; - - for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::UNDEF) - // We mask write here to teach later passes that the ith element of this - // vector is undef. Thus we can use it to reduce 128 bits reg usage, - // break false dependencies and additionnaly make assembly easier to read. 
- RemapSwizzle[i] = 7; // SEL_MASK_WRITE - if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { - if (C->isZero()) { - RemapSwizzle[i] = 4; // SEL_0 - NewBldVec[i] = DAG.getUNDEF(MVT::f32); - } else if (C->isExactlyValue(1.0)) { - RemapSwizzle[i] = 5; // SEL_1 - NewBldVec[i] = DAG.getUNDEF(MVT::f32); - } - } - - if (NewBldVec[i].getOpcode() == ISD::UNDEF) - continue; - for (unsigned j = 0; j < i; j++) { - if (NewBldVec[i] == NewBldVec[j]) { - NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); - RemapSwizzle[i] = j; - break; - } - } - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); -} - -static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, - DenseMap &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); - assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; - bool isUnmovable[4] = { false, false, false, false }; - for (unsigned i = 0; i < 4; i++) { - RemapSwizzle[i] = i; - if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) - ->getZExtValue(); - if (i == Idx) - isUnmovable[Idx] = true; - } - } - - for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) - ->getZExtValue(); - if (isUnmovable[Idx]) - continue; - // Swap i and Idx - std::swap(NewBldVec[Idx], NewBldVec[i]); - std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); - break; - } - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); -} - - -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, - SDValue Swz[4], SelectionDAG &DAG, - SDLoc DL) const { - assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); - // Old -> New swizzle values - DenseMap SwizzleRemap; - - BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); - for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); - } - - SwizzleRemap.clear(); - BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); - for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); - } - - return BuildVector; -} - - -//===----------------------------------------------------------------------===// -// Custom DAG Optimizations -//===----------------------------------------------------------------------===// - -SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - - switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) - case ISD::FP_ROUND: { - SDValue Arg = N->getOperand(0); - if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), - Arg.getOperand(0)); - } - break; - } - - // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> - // (i32 select_cc f32, f32, -1, 0 cc) - // - // Mesa's GLSL frontend generates the above pattern a lot and we can lower - // this to one of the SET*_DX10 instructions. 
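// As a concrete instance (setlt is just an example condition code):
//
//   (i32 fp_to_sint (fneg (select_cc f32 %a, f32 %b, 1.0, 0.0, setlt)))
//     -> (i32 select_cc f32 %a, f32 %b, -1, 0, setlt)
//
// i.e. the 1.0/0.0 float select plus the negate-and-convert collapses into
// a single integer select producing -1 or 0.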
- case ISD::FP_TO_SINT: { - SDValue FNeg = N->getOperand(0); - if (FNeg.getOpcode() != ISD::FNEG) { - return SDValue(); - } - SDValue SelectCC = FNeg.getOperand(0); - if (SelectCC.getOpcode() != ISD::SELECT_CC || - SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS - SelectCC.getOperand(2).getValueType() != MVT::f32 || // True - !isHWTrueValue(SelectCC.getOperand(2)) || - !isHWFalseValue(SelectCC.getOperand(3))) { - return SDValue(); - } - - SDLoc dl(N); - return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), - SelectCC.getOperand(0), // LHS - SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, dl, MVT::i32), // True - DAG.getConstant(0, dl, MVT::i32), // False - SelectCC.getOperand(4)); // CC - - break; - } - - // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx - // => build_vector elt0, ... , NewEltIdx, ... , eltN - case ISD::INSERT_VECTOR_ELT: { - SDValue InVec = N->getOperand(0); - SDValue InVal = N->getOperand(1); - SDValue EltNo = N->getOperand(2); - SDLoc dl(N); - - // If the inserted element is an UNDEF, just use the input vector. - if (InVal.getOpcode() == ISD::UNDEF) - return InVec; - - EVT VT = InVec.getValueType(); - - // If we can't generate a legal BUILD_VECTOR, exit - if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) - return SDValue(); - - // Check that we know which element is being inserted - if (!isa(EltNo)) - return SDValue(); - unsigned Elt = cast(EltNo)->getZExtValue(); - - // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially - // be converted to a BUILD_VECTOR). Fill in the Ops vector with the - // vector elements. - SmallVector Ops; - if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - Ops.append(InVec.getNode()->op_begin(), - InVec.getNode()->op_end()); - } else if (InVec.getOpcode() == ISD::UNDEF) { - unsigned NElts = VT.getVectorNumElements(); - Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); - } else { - return SDValue(); - } - - // Insert the element - if (Elt < Ops.size()) { - // All the operands of BUILD_VECTOR must have the same type; - // we enforce that here. - EVT OpVT = Ops[0].getValueType(); - if (InVal.getValueType() != OpVT) - InVal = OpVT.bitsGT(InVal.getValueType()) ? 
- DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); - Ops[Elt] = InVal; - } - - // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); - } - - // Extract_vec (Build_vector) generated by custom lowering - // also needs to be customly combined - case ISD::EXTRACT_VECTOR_ELT: { - SDValue Arg = N->getOperand(0); - if (Arg.getOpcode() == ISD::BUILD_VECTOR) { - if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { - unsigned Element = Const->getZExtValue(); - return Arg->getOperand(Element); - } - } - if (Arg.getOpcode() == ISD::BITCAST && - Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { - if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { - unsigned Element = Const->getZExtValue(); - return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), - Arg->getOperand(0).getOperand(Element)); - } - } - } - - case ISD::SELECT_CC: { - // Try common optimizations - SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - if (Ret.getNode()) - return Ret; - - // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> - // selectcc x, y, a, b, inv(cc) - // - // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> - // selectcc x, y, a, b, cc - SDValue LHS = N->getOperand(0); - if (LHS.getOpcode() != ISD::SELECT_CC) { - return SDValue(); - } - - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - ISD::CondCode NCC = cast(N->getOperand(4))->get(); - - if (LHS.getOperand(2).getNode() != True.getNode() || - LHS.getOperand(3).getNode() != False.getNode() || - RHS.getNode() != False.getNode()) { - return SDValue(); - } - - switch (NCC) { - default: return SDValue(); - case ISD::SETNE: return LHS; - case ISD::SETEQ: { - ISD::CondCode LHSCC = cast(LHS.getOperand(4))->get(); - LHSCC = ISD::getSetCCInverse(LHSCC, - LHS.getOperand(0).getValueType().isInteger()); - if (DCI.isBeforeLegalizeOps() || - isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) - return DAG.getSelectCC(SDLoc(N), - LHS.getOperand(0), - LHS.getOperand(1), - LHS.getOperand(2), - LHS.getOperand(3), - LHSCC); - break; - } - } - return SDValue(); - } - - case AMDGPUISD::EXPORT: { - SDValue Arg = N->getOperand(1); - if (Arg.getOpcode() != ISD::BUILD_VECTOR) - break; - - SDValue NewArgs[8] = { - N->getOperand(0), // Chain - SDValue(), - N->getOperand(2), // ArrayBase - N->getOperand(3), // Type - N->getOperand(4), // SWZ_X - N->getOperand(5), // SWZ_Y - N->getOperand(6), // SWZ_Z - N->getOperand(7) // SWZ_W - }; - SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); - } - case AMDGPUISD::TEXTURE_FETCH: { - SDValue Arg = N->getOperand(1); - if (Arg.getOpcode() != ISD::BUILD_VECTOR) - break; - - SDValue NewArgs[19] = { - N->getOperand(0), - N->getOperand(1), - N->getOperand(2), - N->getOperand(3), - N->getOperand(4), - N->getOperand(5), - N->getOperand(6), - N->getOperand(7), - N->getOperand(8), - N->getOperand(9), - N->getOperand(10), - N->getOperand(11), - N->getOperand(12), - N->getOperand(13), - N->getOperand(14), - N->getOperand(15), - N->getOperand(16), - N->getOperand(17), - N->getOperand(18), - }; - SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); - } - } - - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); -} - -static bool -FoldOperand(SDNode *ParentNode, unsigned 
SrcIdx, SDValue &Src, SDValue &Neg, - SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); - if (!Src.isMachineOpcode()) - return false; - switch (Src.getMachineOpcode()) { - case AMDGPU::FNEG_R600: - if (!Neg.getNode()) - return false; - Src = Src.getOperand(0); - Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); - return true; - case AMDGPU::FABS_R600: - if (!Abs.getNode()) - return false; - Src = Src.getOperand(0); - Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); - return true; - case AMDGPU::CONST_COPY: { - unsigned Opcode = ParentNode->getMachineOpcode(); - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - - if (!Sel.getNode()) - return false; - - SDValue CstOffset = Src.getOperand(0); - if (ParentNode->getValueType(0).isVector()) - return false; - - // Gather constants values - int SrcIndices[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) - }; - std::vector Consts; - for (int OtherSrcIdx : SrcIndices) { - int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); - if (OtherSrcIdx < 0 || OtherSelIdx < 0) - continue; - if (HasDst) { - OtherSrcIdx--; - OtherSelIdx--; - } - if (RegisterSDNode *Reg = - dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst - = cast(ParentNode->getOperand(OtherSelIdx)); - Consts.push_back(Cst->getZExtValue()); - } - } - } - - ConstantSDNode *Cst = cast(CstOffset); - Consts.push_back(Cst->getZExtValue()); - if (!TII->fitsConstReadLimitations(Consts)) { - return false; - } - - Sel = CstOffset; - Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); - return true; - } - case AMDGPU::MOV_IMM_I32: - case AMDGPU::MOV_IMM_F32: { - unsigned ImmReg = AMDGPU::ALU_LITERAL_X; - uint64_t ImmValue = 0; - - - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { - ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); - float FloatValue = FPC->getValueAPF().convertToFloat(); - if (FloatValue == 0.0) { - ImmReg = AMDGPU::ZERO; - } else if (FloatValue == 0.5) { - ImmReg = AMDGPU::HALF; - } else if (FloatValue == 1.0) { - ImmReg = AMDGPU::ONE; - } else { - ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); - } - } else { - ConstantSDNode *C = dyn_cast(Src.getOperand(0)); - uint64_t Value = C->getZExtValue(); - if (Value == 0) { - ImmReg = AMDGPU::ZERO; - } else if (Value == 1) { - ImmReg = AMDGPU::ONE_INT; - } else { - ImmValue = Value; - } - } - - // Check that we aren't already using an immediate. - // XXX: It's possible for an instruction to have more than one - // immediate operand, but this is not supported yet. 
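// For illustration (hypothetical operands): folding the immediate of
// (MUL_IEEE x, (MOV_IMM_F32 1.0)) simply rewrites the source to the inline
// constant register ONE, leaving the MOV to be deleted as dead code. A
// value with no inline equivalent (say 3.5f) instead uses ALU_LITERAL_X,
// with its bit pattern written into the instruction's literal operand,
// which is only possible if that literal slot is still unused (zero);
// hence the check below.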
- if (ImmReg == AMDGPU::ALU_LITERAL_X) { - if (!Imm.getNode()) - return false; - ConstantSDNode *C = dyn_cast(Imm); - assert(C); - if (C->getZExtValue()) - return false; - Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); - } - Src = DAG.getRegister(ImmReg, MVT::i32); - return true; - } - default: - return false; - } -} - - -/// \brief Fold the instructions after selecting them -SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); - if (!Node->isMachineOpcode()) - return Node; - unsigned Opcode = Node->getMachineOpcode(); - SDValue FakeOp; - - std::vector Ops(Node->op_begin(), Node->op_end()); - - if (Opcode == AMDGPU::DOT_4) { - int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) - }; - int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) - }; - int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) - }; - for (unsigned i = 0; i < 8; i++) { - if (OperandIdx[i] < 0) - return Node; - SDValue &Src = Ops[OperandIdx[i] - 1]; - SDValue &Neg = Ops[NegIdx[i] - 1]; - SDValue &Abs = Ops[AbsIdx[i] - 1]; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - if (HasDst) - SelIdx--; - SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; - if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } else if (Opcode == AMDGPU::REG_SEQUENCE) { - for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { - SDValue &Src = Ops[i]; - if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } else if (Opcode == AMDGPU::CLAMP_R600) { - SDValue Src = Node->getOperand(0); - if (!Src.isMachineOpcode() || - !TII->hasInstrModifiers(Src.getMachineOpcode())) - return Node; - int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), - AMDGPU::OpName::clamp); - if (ClampIdx < 0) - return Node; - SDLoc DL(Node); - std::vector Ops(Src->op_begin(), Src->op_end()); - Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); - return DAG.getMachineNode(Src.getMachineOpcode(), DL, - Node->getVTList(), Ops); - } else { - if (!TII->hasInstrModifiers(Opcode)) - return Node; - int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) - }; - int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) - }; - int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), - -1 - }; - for (unsigned i = 0; i < 3; i++) { - if (OperandIdx[i] < 0) - return Node; - SDValue &Src = Ops[OperandIdx[i] - 1]; - SDValue &Neg = Ops[NegIdx[i] - 1]; - SDValue FakeAbs; - SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); - if (HasDst) { - SelIdx--; - ImmIdx--; - } - SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; - SDValue &Imm = Ops[ImmIdx]; - if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } - - return Node; -} diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h deleted file mode 100644 index c06d3c4fd30..00000000000 --- a/lib/Target/R600/R600ISelLowering.h +++ /dev/null @@ -1,80 +0,0 @@ -//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 DAG Lowering interface definition -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H -#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H - -#include "AMDGPUISelLowering.h" - -namespace llvm { - -class R600InstrInfo; - -class R600TargetLowering : public AMDGPUTargetLowering { -public: - R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - void ReplaceNodeResults(SDNode * N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const override; - SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - EVT getSetCCResultType(LLVMContext &, EVT VT) const override; -private: - unsigned Gen; - /// Each OpenCL kernel has nine implicit parameters that are stored in the - /// first nine dwords of a Vertex Buffer. These implicit parameters are - /// lowered to load instructions which retrieve the values from the Vertex - /// Buffer. - SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, unsigned DwordOffset) const; - - void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, - MachineRegisterInfo & MRI, unsigned dword_offset) const; - SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - SDLoc DL) const; - SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; - - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, - unsigned mainop, unsigned ovf) const; - - SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, - SelectionDAG &DAG) const; - void getStackAddress(unsigned StackWidth, unsigned ElemIdx, - unsigned &Channel, unsigned &PtrIncr) const; - bool isZero(SDValue Op) const; - SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; -}; - -} // End namespace llvm; - -#endif diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td deleted file mode 100644 index 0ffd485476e..00000000000 --- a/lib/Target/R600/R600InstrFormats.td +++ /dev/null @@ -1,495 +0,0 @@ -//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 Instruction format definitions. 
-// -//===----------------------------------------------------------------------===// - -class InstR600 pattern, - InstrItinClass itin> - : AMDGPUInst { - - field bits<64> Inst; - bit Trig = 0; - bit Op3 = 0; - bit isVector = 0; - bits<2> FlagOperandIdx = 0; - bit Op1 = 0; - bit Op2 = 0; - bit LDS_1A = 0; - bit LDS_1A1D = 0; - bit HasNativeOperands = 0; - bit VTXInst = 0; - bit TEXInst = 0; - bit ALUInst = 0; - bit IsExport = 0; - bit LDS_1A2D = 0; - - let Namespace = "AMDGPU"; - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asm; - let Pattern = pattern; - let Itinerary = itin; - - // No AsmMatcher support. - let isCodeGenOnly = 1; - - let TSFlags{4} = Trig; - let TSFlags{5} = Op3; - - // Vector instructions are instructions that must fill all slots in an - // instruction group - let TSFlags{6} = isVector; - let TSFlags{8-7} = FlagOperandIdx; - let TSFlags{9} = HasNativeOperands; - let TSFlags{10} = Op1; - let TSFlags{11} = Op2; - let TSFlags{12} = VTXInst; - let TSFlags{13} = TEXInst; - let TSFlags{14} = ALUInst; - let TSFlags{15} = LDS_1A; - let TSFlags{16} = LDS_1A1D; - let TSFlags{17} = IsExport; - let TSFlags{18} = LDS_1A2D; -} - -//===----------------------------------------------------------------------===// -// ALU instructions -//===----------------------------------------------------------------------===// - -class R600_ALU_LDS_Word0 { - field bits<32> Word0; - - bits<11> src0; - bits<1> src0_rel; - bits<11> src1; - bits<1> src1_rel; - bits<3> index_mode = 0; - bits<2> pred_sel; - bits<1> last; - - bits<9> src0_sel = src0{8-0}; - bits<2> src0_chan = src0{10-9}; - bits<9> src1_sel = src1{8-0}; - bits<2> src1_chan = src1{10-9}; - - let Word0{8-0} = src0_sel; - let Word0{9} = src0_rel; - let Word0{11-10} = src0_chan; - let Word0{21-13} = src1_sel; - let Word0{22} = src1_rel; - let Word0{24-23} = src1_chan; - let Word0{28-26} = index_mode; - let Word0{30-29} = pred_sel; - let Word0{31} = last; -} - -class R600ALU_Word0 : R600_ALU_LDS_Word0 { - - bits<1> src0_neg; - bits<1> src1_neg; - - let Word0{12} = src0_neg; - let Word0{25} = src1_neg; -} - -class R600ALU_Word1 { - field bits<32> Word1; - - bits<11> dst; - bits<3> bank_swizzle; - bits<1> dst_rel; - bits<1> clamp; - - bits<7> dst_sel = dst{6-0}; - bits<2> dst_chan = dst{10-9}; - - let Word1{20-18} = bank_swizzle; - let Word1{27-21} = dst_sel; - let Word1{28} = dst_rel; - let Word1{30-29} = dst_chan; - let Word1{31} = clamp; -} - -class R600ALU_Word1_OP2 alu_inst> : R600ALU_Word1{ - - bits<1> src0_abs; - bits<1> src1_abs; - bits<1> update_exec_mask; - bits<1> update_pred; - bits<1> write; - bits<2> omod; - - let Word1{0} = src0_abs; - let Word1{1} = src1_abs; - let Word1{2} = update_exec_mask; - let Word1{3} = update_pred; - let Word1{4} = write; - let Word1{6-5} = omod; - let Word1{17-7} = alu_inst; -} - -class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ - - bits<11> src2; - bits<1> src2_rel; - bits<1> src2_neg; - - bits<9> src2_sel = src2{8-0}; - bits<2> src2_chan = src2{10-9}; - - let Word1{8-0} = src2_sel; - let Word1{9} = src2_rel; - let Word1{11-10} = src2_chan; - let Word1{12} = src2_neg; - let Word1{17-13} = alu_inst; -} - -class R600LDS_Word1 { - field bits<32> Word1; - - bits<11> src2; - bits<9> src2_sel = src2{8-0}; - bits<2> src2_chan = src2{10-9}; - bits<1> src2_rel; - // offset specifies the stride offset to the second set of data to be read - // from. This is a dword offset. 
- bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP - bits<3> bank_swizzle; - bits<6> lds_op; - bits<2> dst_chan = 0; - - let Word1{8-0} = src2_sel; - let Word1{9} = src2_rel; - let Word1{11-10} = src2_chan; - let Word1{17-13} = alu_inst; - let Word1{20-18} = bank_swizzle; - let Word1{26-21} = lds_op; - let Word1{30-29} = dst_chan; -} - - -/* -XXX: R600 subtarget uses a slightly different encoding than the other -subtargets. We currently handle this in R600MCCodeEmitter, but we may -want to use these instruction classes in the future. - -class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { - - bits<1> fog_merge; - bits<10> alu_inst; - - let Inst{37} = fog_merge; - let Inst{39-38} = omod; - let Inst{49-40} = alu_inst; -} - -class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { - - bits<11> alu_inst; - - let Inst{38-37} = omod; - let Inst{49-39} = alu_inst; -} -*/ - -//===----------------------------------------------------------------------===// -// Vertex Fetch instructions -//===----------------------------------------------------------------------===// - -class VTX_WORD0 { - field bits<32> Word0; - bits<7> src_gpr; - bits<5> VC_INST; - bits<2> FETCH_TYPE; - bits<1> FETCH_WHOLE_QUAD; - bits<8> BUFFER_ID; - bits<1> SRC_REL; - bits<2> SRC_SEL_X; - - let Word0{4-0} = VC_INST; - let Word0{6-5} = FETCH_TYPE; - let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = BUFFER_ID; - let Word0{22-16} = src_gpr; - let Word0{23} = SRC_REL; - let Word0{25-24} = SRC_SEL_X; -} - -class VTX_WORD0_eg : VTX_WORD0 { - - bits<6> MEGA_FETCH_COUNT; - - let Word0{31-26} = MEGA_FETCH_COUNT; -} - -class VTX_WORD0_cm : VTX_WORD0 { - - bits<2> SRC_SEL_Y; - bits<2> STRUCTURED_READ; - bits<1> LDS_REQ; - bits<1> COALESCED_READ; - - let Word0{27-26} = SRC_SEL_Y; - let Word0{29-28} = STRUCTURED_READ; - let Word0{30} = LDS_REQ; - let Word0{31} = COALESCED_READ; -} - -class VTX_WORD1_GPR { - field bits<32> Word1; - bits<7> dst_gpr; - bits<1> DST_REL; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; - bits<1> USE_CONST_FIELDS; - bits<6> DATA_FORMAT; - bits<2> NUM_FORMAT_ALL; - bits<1> FORMAT_COMP_ALL; - bits<1> SRF_MODE_ALL; - - let Word1{6-0} = dst_gpr; - let Word1{7} = DST_REL; - let Word1{8} = 0; // Reserved - let Word1{11-9} = DST_SEL_X; - let Word1{14-12} = DST_SEL_Y; - let Word1{17-15} = DST_SEL_Z; - let Word1{20-18} = DST_SEL_W; - let Word1{21} = USE_CONST_FIELDS; - let Word1{27-22} = DATA_FORMAT; - let Word1{29-28} = NUM_FORMAT_ALL; - let Word1{30} = FORMAT_COMP_ALL; - let Word1{31} = SRF_MODE_ALL; -} - -//===----------------------------------------------------------------------===// -// Texture fetch instructions -//===----------------------------------------------------------------------===// - -class TEX_WORD0 { - field bits<32> Word0; - - bits<5> TEX_INST; - bits<2> INST_MOD; - bits<1> FETCH_WHOLE_QUAD; - bits<8> RESOURCE_ID; - bits<7> SRC_GPR; - bits<1> SRC_REL; - bits<1> ALT_CONST; - bits<2> RESOURCE_INDEX_MODE; - bits<2> SAMPLER_INDEX_MODE; - - let Word0{4-0} = TEX_INST; - let Word0{6-5} = INST_MOD; - let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = RESOURCE_ID; - let Word0{22-16} = SRC_GPR; - let Word0{23} = SRC_REL; - let Word0{24} = ALT_CONST; - let Word0{26-25} = RESOURCE_INDEX_MODE; - let Word0{28-27} = SAMPLER_INDEX_MODE; -} - -class TEX_WORD1 { - field bits<32> Word1; - - bits<7> DST_GPR; - bits<1> DST_REL; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; - bits<7> LOD_BIAS; - bits<1> COORD_TYPE_X; - bits<1> COORD_TYPE_Y; - bits<1> 
COORD_TYPE_Z; - bits<1> COORD_TYPE_W; - - let Word1{6-0} = DST_GPR; - let Word1{7} = DST_REL; - let Word1{11-9} = DST_SEL_X; - let Word1{14-12} = DST_SEL_Y; - let Word1{17-15} = DST_SEL_Z; - let Word1{20-18} = DST_SEL_W; - let Word1{27-21} = LOD_BIAS; - let Word1{28} = COORD_TYPE_X; - let Word1{29} = COORD_TYPE_Y; - let Word1{30} = COORD_TYPE_Z; - let Word1{31} = COORD_TYPE_W; -} - -class TEX_WORD2 { - field bits<32> Word2; - - bits<5> OFFSET_X; - bits<5> OFFSET_Y; - bits<5> OFFSET_Z; - bits<5> SAMPLER_ID; - bits<3> SRC_SEL_X; - bits<3> SRC_SEL_Y; - bits<3> SRC_SEL_Z; - bits<3> SRC_SEL_W; - - let Word2{4-0} = OFFSET_X; - let Word2{9-5} = OFFSET_Y; - let Word2{14-10} = OFFSET_Z; - let Word2{19-15} = SAMPLER_ID; - let Word2{22-20} = SRC_SEL_X; - let Word2{25-23} = SRC_SEL_Y; - let Word2{28-26} = SRC_SEL_Z; - let Word2{31-29} = SRC_SEL_W; -} - -//===----------------------------------------------------------------------===// -// Control Flow Instructions -//===----------------------------------------------------------------------===// - -class CF_WORD1_R600 { - field bits<32> Word1; - - bits<3> POP_COUNT; - bits<5> CF_CONST; - bits<2> COND; - bits<3> COUNT; - bits<6> CALL_COUNT; - bits<1> COUNT_3; - bits<1> END_OF_PROGRAM; - bits<1> VALID_PIXEL_MODE; - bits<7> CF_INST; - bits<1> WHOLE_QUAD_MODE; - bits<1> BARRIER; - - let Word1{2-0} = POP_COUNT; - let Word1{7-3} = CF_CONST; - let Word1{9-8} = COND; - let Word1{12-10} = COUNT; - let Word1{18-13} = CALL_COUNT; - let Word1{19} = COUNT_3; - let Word1{21} = END_OF_PROGRAM; - let Word1{22} = VALID_PIXEL_MODE; - let Word1{29-23} = CF_INST; - let Word1{30} = WHOLE_QUAD_MODE; - let Word1{31} = BARRIER; -} - -class CF_WORD0_EG { - field bits<32> Word0; - - bits<24> ADDR; - bits<3> JUMPTABLE_SEL; - - let Word0{23-0} = ADDR; - let Word0{26-24} = JUMPTABLE_SEL; -} - -class CF_WORD1_EG { - field bits<32> Word1; - - bits<3> POP_COUNT; - bits<5> CF_CONST; - bits<2> COND; - bits<6> COUNT; - bits<1> VALID_PIXEL_MODE; - bits<1> END_OF_PROGRAM; - bits<8> CF_INST; - bits<1> BARRIER; - - let Word1{2-0} = POP_COUNT; - let Word1{7-3} = CF_CONST; - let Word1{9-8} = COND; - let Word1{15-10} = COUNT; - let Word1{20} = VALID_PIXEL_MODE; - let Word1{21} = END_OF_PROGRAM; - let Word1{29-22} = CF_INST; - let Word1{31} = BARRIER; -} - -class CF_ALU_WORD0 { - field bits<32> Word0; - - bits<22> ADDR; - bits<4> KCACHE_BANK0; - bits<4> KCACHE_BANK1; - bits<2> KCACHE_MODE0; - - let Word0{21-0} = ADDR; - let Word0{25-22} = KCACHE_BANK0; - let Word0{29-26} = KCACHE_BANK1; - let Word0{31-30} = KCACHE_MODE0; -} - -class CF_ALU_WORD1 { - field bits<32> Word1; - - bits<2> KCACHE_MODE1; - bits<8> KCACHE_ADDR0; - bits<8> KCACHE_ADDR1; - bits<7> COUNT; - bits<1> ALT_CONST; - bits<4> CF_INST; - bits<1> WHOLE_QUAD_MODE; - bits<1> BARRIER; - - let Word1{1-0} = KCACHE_MODE1; - let Word1{9-2} = KCACHE_ADDR0; - let Word1{17-10} = KCACHE_ADDR1; - let Word1{24-18} = COUNT; - let Word1{25} = ALT_CONST; - let Word1{29-26} = CF_INST; - let Word1{30} = WHOLE_QUAD_MODE; - let Word1{31} = BARRIER; -} - -class CF_ALLOC_EXPORT_WORD0_RAT { - field bits<32> Word0; - - bits<4> rat_id; - bits<6> rat_inst; - bits<2> rim; - bits<2> type; - bits<7> rw_gpr; - bits<1> rw_rel; - bits<7> index_gpr; - bits<2> elem_size; - - let Word0{3-0} = rat_id; - let Word0{9-4} = rat_inst; - let Word0{10} = 0; // Reserved - let Word0{12-11} = rim; - let Word0{14-13} = type; - let Word0{21-15} = rw_gpr; - let Word0{22} = rw_rel; - let Word0{29-23} = index_gpr; - let Word0{31-30} = elem_size; -} - -class CF_ALLOC_EXPORT_WORD1_BUF 
{ - field bits<32> Word1; - - bits<12> array_size; - bits<4> comp_mask; - bits<4> burst_count; - bits<1> vpm; - bits<1> eop; - bits<8> cf_inst; - bits<1> mark; - bits<1> barrier; - - let Word1{11-0} = array_size; - let Word1{15-12} = comp_mask; - let Word1{19-16} = burst_count; - let Word1{20} = vpm; - let Word1{21} = eop; - let Word1{29-22} = cf_inst; - let Word1{30} = mark; - let Word1{31} = barrier; -} diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp deleted file mode 100644 index 5ef883cbcad..00000000000 --- a/lib/Target/R600/R600InstrInfo.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Implementation of TargetInstrInfo. -// -//===----------------------------------------------------------------------===// - -#include "R600InstrInfo.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "AMDGPUTargetMachine.h" -#include "R600Defines.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenDFAPacketizer.inc" - -R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} - -const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { - return RI; -} - -bool R600InstrInfo::isTrig(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; -} - -bool R600InstrInfo::isVector(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; -} - -void -R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - unsigned VectorComponents = 0; - if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { - VectorComponents = 4; - } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { - VectorComponents = 2; - } - - if (VectorComponents > 0) { - for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = RI.getSubRegFromChannel(I); - buildDefaultInstruction(MBB, MI, AMDGPU::MOV, - RI.getSubReg(DestReg, SubRegIndex), - RI.getSubReg(SrcReg, SubRegIndex)) - .addReg(DestReg, - RegState::Define | RegState::Implicit); - } - } else { - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, - DestReg, SrcReg); - NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) - .setIsKill(KillSrc); - } -} - -/// \returns true if \p MBBI can be moved into a new basic. 
-bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const { - for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), - E = MBBI->operands_end(); I != E; ++I) { - if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && - I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) - return false; - } - return true; -} - -bool R600InstrInfo::isMov(unsigned Opcode) const { - - - switch(Opcode) { - default: return false; - case AMDGPU::MOV: - case AMDGPU::MOV_IMM_F32: - case AMDGPU::MOV_IMM_I32: - return true; - } -} - -// Some instructions act as place holders to emulate operations that the GPU -// hardware does automatically. This function can be used to check if -// an opcode falls into this category. -bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { - switch (Opcode) { - default: return false; - case AMDGPU::RETURN: - return true; - } -} - -bool R600InstrInfo::isReductionOp(unsigned Opcode) const { - return false; -} - -bool R600InstrInfo::isCubeOp(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: - return true; - } -} - -bool R600InstrInfo::isALUInstr(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return (TargetFlags & R600_InstFlag::ALU_INST); -} - -bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return ((TargetFlags & R600_InstFlag::OP1) | - (TargetFlags & R600_InstFlag::OP2) | - (TargetFlags & R600_InstFlag::OP3)); -} - -bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return ((TargetFlags & R600_InstFlag::LDS_1A) | - (TargetFlags & R600_InstFlag::LDS_1A1D) | - (TargetFlags & R600_InstFlag::LDS_1A2D)); -} - -bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; -} - -bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; -} - -bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { - if (isALUInstr(MI->getOpcode())) - return true; - if (isVector(*MI) || isCubeOp(MI->getOpcode())) - return true; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: - return true; - default: - return false; - } -} - -bool R600InstrInfo::isTransOnly(unsigned Opcode) const { - if (ST.hasCaymanISA()) - return false; - return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); -} - -bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { - return isTransOnly(MI->getOpcode()); -} - -bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { - return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); -} - -bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { - return isVectorOnly(MI->getOpcode()); -} - -bool R600InstrInfo::isExport(unsigned Opcode) const { - return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); -} - -bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { - return ST.hasVertexCache() && IS_VTX(get(Opcode)); -} - -bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const 
R600MachineFunctionInfo *MFI = MF->getInfo(); - return MFI->getShaderType() != ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode()); -} - -bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { - return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); -} - -bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return (MFI->getShaderType() == ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode())) || - usesTextureCache(MI->getOpcode()); -} - -bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { - switch (Opcode) { - case AMDGPU::KILLGT: - case AMDGPU::GROUP_BARRIER: - return true; - default: - return false; - } -} - -bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; -} - -bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; -} - -bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { - if (!isALUInstr(MI->getOpcode())) { - return false; - } - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - if (!I->isReg() || !I->isUse() || - TargetRegisterInfo::isVirtualRegister(I->getReg())) - continue; - - if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) - return true; - } - return false; -} - -int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { - static const unsigned OpTable[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - - assert (SrcNum < 3); - return getOperandIdx(Opcode, OpTable[SrcNum]); -} - -int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { - static const unsigned SrcSelTable[][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} - }; - - for (const auto &Row : SrcSelTable) { - if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { - return getOperandIdx(Opcode, Row[1]); - } - } - return -1; -} - -SmallVector, 3> -R600InstrInfo::getSrcs(MachineInstr *MI) const { - SmallVector, 3> Result; - - if (MI->getOpcode() == AMDGPU::DOT_4) { - static const unsigned OpTable[8][2] = { - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, - }; - - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][0])); - unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), - 
OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); - continue; - } - - } - return Result; - } - - static const unsigned OpTable[3][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - }; - - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); - if (SrcIdx < 0) - break; - MachineOperand &MO = MI->getOperand(SrcIdx); - unsigned Reg = MI->getOperand(SrcIdx).getReg(); - if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand( - getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); - continue; - } - if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned Imm = MI->getOperand( - getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); - Result.push_back(std::pair(&MO, Imm)); - continue; - } - Result.push_back(std::pair(&MO, 0)); - } - return Result; -} - -std::vector > -R600InstrInfo::ExtractSrcs(MachineInstr *MI, - const DenseMap &PV, - unsigned &ConstCount) const { - ConstCount = 0; - ArrayRef> Srcs = getSrcs(MI); - const std::pair DummyPair(-1, 0); - std::vector > Result; - unsigned i = 0; - for (unsigned n = Srcs.size(); i < n; ++i) { - unsigned Reg = Srcs[i].first->getReg(); - unsigned Index = RI.getEncodingValue(Reg) & 0xff; - if (Reg == AMDGPU::OQAP) { - Result.push_back(std::pair(Index, 0)); - } - if (PV.find(Reg) != PV.end()) { - // 255 is used to tells its a PS/PV reg - Result.push_back(std::pair(255, 0)); - continue; - } - if (Index > 127) { - ConstCount++; - Result.push_back(DummyPair); - continue; - } - unsigned Chan = RI.getHWRegChan(Reg); - Result.push_back(std::pair(Index, Chan)); - } - for (; i < 3; ++i) - Result.push_back(DummyPair); - return Result; -} - -static std::vector > -Swizzle(std::vector > Src, - R600InstrInfo::BankSwizzle Swz) { - if (Src[0] == Src[1]) - Src[1].first = -1; - switch (Swz) { - case R600InstrInfo::ALU_VEC_012_SCL_210: - break; - case R600InstrInfo::ALU_VEC_021_SCL_122: - std::swap(Src[1], Src[2]); - break; - case R600InstrInfo::ALU_VEC_102_SCL_221: - std::swap(Src[0], Src[1]); - break; - case R600InstrInfo::ALU_VEC_120_SCL_212: - std::swap(Src[0], Src[1]); - std::swap(Src[0], Src[2]); - break; - case R600InstrInfo::ALU_VEC_201: - std::swap(Src[0], Src[2]); - std::swap(Src[0], Src[1]); - break; - case R600InstrInfo::ALU_VEC_210: - std::swap(Src[0], Src[2]); - break; - } - return Src; -} - -static unsigned -getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { - switch (Swz) { - case R600InstrInfo::ALU_VEC_012_SCL_210: { - unsigned Cycles[3] = { 2, 1, 0}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_021_SCL_122: { - unsigned Cycles[3] = { 1, 2, 2}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_120_SCL_212: { - unsigned Cycles[3] = { 2, 1, 2}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_102_SCL_221: { - unsigned Cycles[3] = { 2, 2, 1}; - return Cycles[Op]; - } - default: - llvm_unreachable("Wrong Swizzle for Trans Slot"); - return 0; - } -} - -/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed -/// in the same Instruction Group while meeting read port limitations given a -/// Swz swizzle sequence. 
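/// Roughly: each of the four register banks (the X/Y/Z/W channels) can feed
/// only one register per read cycle, so two operands that fall on the same
/// bank and cycle are only compatible if they name the same register.
/// PS/PV forwards (encoded as index 255) bypass the banks, and the OQAP
/// output queue can only be fetched during the first cycle.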
-unsigned R600InstrInfo::isLegalUpTo( - const std::vector > > &IGSrcs, - const std::vector &Swz, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const { - int Vector[4][3]; - memset(Vector, -1, sizeof(Vector)); - for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { - const std::vector > &Srcs = - Swizzle(IGSrcs[i], Swz[i]); - for (unsigned j = 0; j < 3; j++) { - const std::pair &Src = Srcs[j]; - if (Src.first < 0 || Src.first == 255) - continue; - if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { - if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && - Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { - // The value from output queue A (denoted by register OQAP) can - // only be fetched during the first cycle. - return false; - } - // OQAP does not count towards the normal read port restrictions - continue; - } - if (Vector[Src.second][j] < 0) - Vector[Src.second][j] = Src.first; - if (Vector[Src.second][j] != Src.first) - return i; - } - } - // Now check Trans Alu - for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) { - const std::pair &Src = TransSrcs[i]; - unsigned Cycle = getTransSwizzle(TransSwz, i); - if (Src.first < 0) - continue; - if (Src.first == 255) - continue; - if (Vector[Src.second][Cycle] < 0) - Vector[Src.second][Cycle] = Src.first; - if (Vector[Src.second][Cycle] != Src.first) - return IGSrcs.size() - 1; - } - return IGSrcs.size(); -} - -/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next -/// (in lexicographic term) swizzle sequence assuming that all swizzles after -/// Idx can be skipped -static bool -NextPossibleSolution( - std::vector &SwzCandidate, - unsigned Idx) { - assert(Idx < SwzCandidate.size()); - int ResetIdx = Idx; - while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210) - ResetIdx --; - for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) { - SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210; - } - if (ResetIdx == -1) - return false; - int NextSwizzle = SwzCandidate[ResetIdx] + 1; - SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle; - return true; -} - -/// Enumerate all possible Swizzle sequence to find one that can meet all -/// read port requirements. -bool R600InstrInfo::FindSwizzleForVectorSlot( - const std::vector > > &IGSrcs, - std::vector &SwzCandidate, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const { - unsigned ValidUpTo = 0; - do { - ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz); - if (ValidUpTo == IGSrcs.size()) - return true; - } while (NextPossibleSolution(SwzCandidate, ValidUpTo)); - return false; -} - -/// Instructions in Trans slot can't read gpr at cycle 0 if they also read -/// a const, and can't read a gpr at cycle 1 if they read 2 const. 
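/// Equivalently: with one constant the swizzle must keep GPR reads out of
/// cycle 0, with two constants out of cycles 0 and 1, and three constants
/// can never be satisfied, which is why the helper below rejects
/// ConstCount > 2.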
-static bool -isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, - const std::vector > &TransOps, - unsigned ConstCount) { - // TransALU can't read 3 constants - if (ConstCount > 2) - return false; - for (unsigned i = 0, e = TransOps.size(); i < e; ++i) { - const std::pair &Src = TransOps[i]; - unsigned Cycle = getTransSwizzle(TransSwz, i); - if (Src.first < 0) - continue; - if (ConstCount > 0 && Cycle == 0) - return false; - if (ConstCount > 1 && Cycle == 1) - return false; - } - return true; -} - -bool -R600InstrInfo::fitsReadPortLimitations(const std::vector &IG, - const DenseMap &PV, - std::vector &ValidSwizzle, - bool isLastAluTrans) - const { - //Todo : support shared src0 - src1 operand - - std::vector > > IGSrcs; - ValidSwizzle.clear(); - unsigned ConstCount; - BankSwizzle TransBS = ALU_VEC_012_SCL_210; - for (unsigned i = 0, e = IG.size(); i < e; ++i) { - IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); - unsigned Op = getOperandIdx(IG[i]->getOpcode(), - AMDGPU::OpName::bank_swizzle); - ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) - IG[i]->getOperand(Op).getImm()); - } - std::vector > TransOps; - if (!isLastAluTrans) - return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); - - TransOps = std::move(IGSrcs.back()); - IGSrcs.pop_back(); - ValidSwizzle.pop_back(); - - static const R600InstrInfo::BankSwizzle TransSwz[] = { - ALU_VEC_012_SCL_210, - ALU_VEC_021_SCL_122, - ALU_VEC_120_SCL_212, - ALU_VEC_102_SCL_221 - }; - for (unsigned i = 0; i < 4; i++) { - TransBS = TransSwz[i]; - if (!isConstCompatible(TransBS, TransOps, ConstCount)) - continue; - bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, - TransBS); - if (Result) { - ValidSwizzle.push_back(TransBS); - return true; - } - } - - return false; -} - - -bool -R600InstrInfo::fitsConstReadLimitations(const std::vector &Consts) - const { - assert (Consts.size() <= 12 && "Too many operands in instructions group"); - unsigned Pair1 = 0, Pair2 = 0; - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - unsigned ReadConstHalf = Consts[i] & 2; - unsigned ReadConstIndex = Consts[i] & (~3); - unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; - if (!Pair1) { - Pair1 = ReadHalfConst; - continue; - } - if (Pair1 == ReadHalfConst) - continue; - if (!Pair2) { - Pair2 = ReadHalfConst; - continue; - } - if (Pair2 != ReadHalfConst) - return false; - } - return true; -} - -bool -R600InstrInfo::fitsConstReadLimitations(const std::vector &MIs) - const { - std::vector Consts; - SmallSet Literals; - for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr *MI = MIs[i]; - if (!isALUInstr(MI->getOpcode())) - continue; - - ArrayRef> Srcs = getSrcs(MI); - - for (unsigned j = 0, e = Srcs.size(); j < e; j++) { - std::pair Src = Srcs[j]; - if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) - Literals.insert(Src.second); - if (Literals.size() > 4) - return false; - if (Src.first->getReg() == AMDGPU::ALU_CONST) - Consts.push_back(Src.second); - if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || - AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { - unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; - unsigned Chan = RI.getHWRegChan(Src.first->getReg()); - Consts.push_back((Index << 2) | Chan); - } - } - } - return fitsConstReadLimitations(Consts); -} - -DFAPacketizer * -R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { - const InstrItineraryData *II = STI.getInstrItineraryData(); - return 
static_cast(STI).createDFAPacketizer(II); -} - -static bool -isPredicateSetter(unsigned Opcode) { - switch (Opcode) { - case AMDGPU::PRED_X: - return true; - default: - return false; - } -} - -static MachineInstr * -findFirstPredicateSetterFrom(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - while (I != MBB.begin()) { - --I; - MachineInstr *MI = I; - if (isPredicateSetter(MI->getOpcode())) - return MI; - } - - return nullptr; -} - -static -bool isJump(unsigned Opcode) { - return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; -} - -static bool isBranch(unsigned Opcode) { - return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || - Opcode == AMDGPU::BRANCH_COND_f32; -} - -bool -R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { - // Most of the following comes from the ARM implementation of AnalyzeBranch - - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - // AMDGPU::BRANCH* instructions are only available after isel and are not - // handled - if (isBranch(I->getOpcode())) - return true; - if (!isJump(static_cast(I)->getOpcode())) { - return false; - } - - // Remove successive JUMP - while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { - MachineBasicBlock::iterator PriorI = std::prev(I); - if (AllowModify) - I->removeFromParent(); - I = PriorI; - } - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || - !isJump(static_cast(--I)->getOpcode())) { - if (LastOpc == AMDGPU::JUMP) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else if (LastOpc == AMDGPU::JUMP_COND) { - MachineInstr *predSet = I; - while (!isPredicateSetter(predSet->getOpcode())) { - predSet = --I; - } - TBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(predSet->getOperand(1)); - Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If the block ends with a B and a Bcc, handle it. - if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { - MachineInstr *predSet = --I; - while (!isPredicateSetter(predSet->getOpcode())) { - predSet = --I; - } - TBB = SecondLastInst->getOperand(0).getMBB(); - FBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(predSet->getOperand(1)); - Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); - return false; - } - - // Otherwise, can't handle this. 
- return true; -} - -static -MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { - for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); - It != E; ++It) { - if (It->getOpcode() == AMDGPU::CF_ALU || - It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) - return std::prev(It.base()); - } - return MBB.end(); -} - -unsigned -R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - ArrayRef Cond, - DebugLoc DL) const { - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - - if (!FBB) { - if (Cond.empty()) { - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); - return 1; - } else { - MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); - assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); - PredSet->getOperand(2).setImm(Cond[1].getImm()); - - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) - .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - return 1; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); - return 1; - } - } else { - MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); - assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); - PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) - .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - return 2; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); - return 2; - } -} - -unsigned -R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - - // Note : we leave PRED* instructions there. - // They may be needed when predicating instructions. - - MachineBasicBlock::iterator I = MBB.end(); - - if (I == MBB.begin()) { - return 0; - } - --I; - switch (I->getOpcode()) { - default: - return 0; - case AMDGPU::JUMP_COND: { - MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); - I->eraseFromParent(); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); - break; - } - case AMDGPU::JUMP: - I->eraseFromParent(); - break; - } - I = MBB.end(); - - if (I == MBB.begin()) { - return 1; - } - --I; - switch (I->getOpcode()) { - // FIXME: only one case?? 
- default: - return 1; - case AMDGPU::JUMP_COND: { - MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); - I->eraseFromParent(); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); - break; - } - case AMDGPU::JUMP: - I->eraseFromParent(); - break; - } - return 2; -} - -bool -R600InstrInfo::isPredicated(const MachineInstr *MI) const { - int idx = MI->findFirstPredOperandIdx(); - if (idx < 0) - return false; - - unsigned Reg = MI->getOperand(idx).getReg(); - switch (Reg) { - default: return false; - case AMDGPU::PRED_SEL_ONE: - case AMDGPU::PRED_SEL_ZERO: - case AMDGPU::PREDICATE_BIT: - return true; - } -} - -bool -R600InstrInfo::isPredicable(MachineInstr *MI) const { - // XXX: KILL* instructions can be predicated, but they must be the last - // instruction in a clause, so this means any instructions after them cannot - // be predicated. Until we have proper support for instruction clauses in the - // backend, we will mark KILL* instructions as unpredicable. - - if (MI->getOpcode() == AMDGPU::KILLGT) { - return false; - } else if (MI->getOpcode() == AMDGPU::CF_ALU) { - // If the clause start in the middle of MBB then the MBB has more - // than a single clause, unable to predicate several clauses. - if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) - return false; - // TODO: We don't support KC merging atm - if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) - return false; - return true; - } else if (isVector(*MI)) { - return false; - } else { - return AMDGPUInstrInfo::isPredicable(MI); - } -} - - -bool -R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ - return true; -} - -bool -R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, - unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, - unsigned ExtraFCycles, - const BranchProbability &Probability) const { - return true; -} - -bool -R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, - const BranchProbability &Probability) - const { - return true; -} - -bool -R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const { - return false; -} - - -bool -R600InstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) const { - MachineOperand &MO = Cond[1]; - switch (MO.getImm()) { - case OPCODE_IS_ZERO_INT: - MO.setImm(OPCODE_IS_NOT_ZERO_INT); - break; - case OPCODE_IS_NOT_ZERO_INT: - MO.setImm(OPCODE_IS_ZERO_INT); - break; - case OPCODE_IS_ZERO: - MO.setImm(OPCODE_IS_NOT_ZERO); - break; - case OPCODE_IS_NOT_ZERO: - MO.setImm(OPCODE_IS_ZERO); - break; - default: - return true; - } - - MachineOperand &MO2 = Cond[2]; - switch (MO2.getReg()) { - case AMDGPU::PRED_SEL_ZERO: - MO2.setReg(AMDGPU::PRED_SEL_ONE); - break; - case AMDGPU::PRED_SEL_ONE: - MO2.setReg(AMDGPU::PRED_SEL_ZERO); - break; - default: - return true; - } - return false; -} - -bool -R600InstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const { - return isPredicateSetter(MI->getOpcode()); -} - - -bool -R600InstrInfo::SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const { - return false; -} - - -bool -R600InstrInfo::PredicateInstruction(MachineInstr *MI, - ArrayRef Pred) const { - int PIdx = 
MI->findFirstPredOperandIdx(); - - if (MI->getOpcode() == AMDGPU::CF_ALU) { - MI->getOperand(8).setImm(0); - return true; - } - - if (MI->getOpcode() == AMDGPU::DOT_4) { - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) - .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) - .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) - .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) - .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); - return true; - } - - if (PIdx != -1) { - MachineOperand &PMO = MI->getOperand(PIdx); - PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); - return true; - } - - return false; -} - -unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { - return 2; -} - -unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { - if (PredCost) - *PredCost = 2; - return 2; -} - -bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::R600_EXTRACT_ELT_V2: - case AMDGPU::R600_EXTRACT_ELT_V4: - buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(2).getReg(), - RI.getHWRegChan(MI->getOperand(1).getReg())); - break; - case AMDGPU::R600_INSERT_ELT_V2: - case AMDGPU::R600_INSERT_ELT_V4: - buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(3).getReg(), // Offset - RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel - break; - } - MI->eraseFromParent(); - return true; -} - -void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - const AMDGPUFrameLowering *TFL = static_cast( - MF.getSubtarget().getFrameLowering()); - - unsigned StackWidth = TFL->getStackWidth(MF); - int End = getIndirectIndexEnd(MF); - - if (End == -1) - return; - - for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { - unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); - Reserved.set(SuperReg); - for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); - Reserved.set(Reg); - } - } -} - -unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - // XXX: Remove when we support a stack width > 2 - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::R600_TReg32_XRegClass; -} - -MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const { - return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); -} - -MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const { - unsigned AddrReg; - switch (AddrChan) { - default: llvm_unreachable("Invalid 
Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; - } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); - - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, - AddrReg, ValueReg) - .addReg(AMDGPU::AR_X, - RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); - return Mov; -} - -MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const { - return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); -} - -MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const { - unsigned AddrReg; - switch (AddrChan) { - default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; - } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, - OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, - ValueReg, - AddrReg) - .addReg(AMDGPU::AR_X, - RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); - - return Mov; -} - -unsigned R600InstrInfo::getMaxAlusPerClause() const { - return 115; -} - -MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opcode, - unsigned DstReg, - unsigned Src0Reg, - unsigned Src1Reg) const { - MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), - DstReg); // $dst - - if (Src1Reg) { - MIB.addImm(0) // $update_exec_mask - .addImm(0); // $update_predicate - } - MIB.addImm(1) // $write - .addImm(0) // $omod - .addImm(0) // $dst_rel - .addImm(0) // $dst_clamp - .addReg(Src0Reg) // $src0 - .addImm(0) // $src0_neg - .addImm(0) // $src0_rel - .addImm(0) // $src0_abs - .addImm(-1); // $src0_sel - - if (Src1Reg) { - MIB.addReg(Src1Reg) // $src1 - .addImm(0) // $src1_neg - .addImm(0) // $src1_rel - .addImm(0) // $src1_abs - .addImm(-1); // $src1_sel - } - - //XXX: The r600g finalizer expects this to be 1, once we've moved the - //scheduling to the backend, we can change the default to 0. 
- MIB.addImm(1) // $last - .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel - .addImm(0) // $literal - .addImm(0); // $bank_swizzle - - return MIB; -} - -#define OPERAND_CASE(Label) \ - case Label: { \ - static const unsigned Ops[] = \ - { \ - Label##_X, \ - Label##_Y, \ - Label##_Z, \ - Label##_W \ - }; \ - return Ops[Slot]; \ - } - -static unsigned getSlotedOps(unsigned Op, unsigned Slot) { - switch (Op) { - OPERAND_CASE(AMDGPU::OpName::update_exec_mask) - OPERAND_CASE(AMDGPU::OpName::update_pred) - OPERAND_CASE(AMDGPU::OpName::write) - OPERAND_CASE(AMDGPU::OpName::omod) - OPERAND_CASE(AMDGPU::OpName::dst_rel) - OPERAND_CASE(AMDGPU::OpName::clamp) - OPERAND_CASE(AMDGPU::OpName::src0) - OPERAND_CASE(AMDGPU::OpName::src0_neg) - OPERAND_CASE(AMDGPU::OpName::src0_rel) - OPERAND_CASE(AMDGPU::OpName::src0_abs) - OPERAND_CASE(AMDGPU::OpName::src0_sel) - OPERAND_CASE(AMDGPU::OpName::src1) - OPERAND_CASE(AMDGPU::OpName::src1_neg) - OPERAND_CASE(AMDGPU::OpName::src1_rel) - OPERAND_CASE(AMDGPU::OpName::src1_abs) - OPERAND_CASE(AMDGPU::OpName::src1_sel) - OPERAND_CASE(AMDGPU::OpName::pred_sel) - default: - llvm_unreachable("Wrong Operand"); - } -} - -#undef OPERAND_CASE - -MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( - MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) - const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); - unsigned Opcode; - if (ST.getGeneration() <= AMDGPUSubtarget::R700) - Opcode = AMDGPU::DOT4_r600; - else - Opcode = AMDGPU::DOT4_eg; - MachineBasicBlock::iterator I = MI; - MachineOperand &Src0 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); - MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); - MachineInstr *MIB = buildDefaultInstruction( - MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); - static const unsigned Operands[14] = { - AMDGPU::OpName::update_exec_mask, - AMDGPU::OpName::update_pred, - AMDGPU::OpName::write, - AMDGPU::OpName::omod, - AMDGPU::OpName::dst_rel, - AMDGPU::OpName::clamp, - AMDGPU::OpName::src0_neg, - AMDGPU::OpName::src0_rel, - AMDGPU::OpName::src0_abs, - AMDGPU::OpName::src0_sel, - AMDGPU::OpName::src1_neg, - AMDGPU::OpName::src1_rel, - AMDGPU::OpName::src1_abs, - AMDGPU::OpName::src1_sel, - }; - - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) - .setReg(MO.getReg()); - - for (unsigned i = 0; i < 14; i++) { - MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); - assert (MO.isImm()); - setImmOperand(MIB, Operands[i], MO.getImm()); - } - MIB->getOperand(20).setImm(0); - return MIB; -} - -MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const { - MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, - AMDGPU::ALU_LITERAL_X); - setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); - return MovImm; -} - -MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const { - return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); -} - -int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { - return getOperandIdx(MI.getOpcode(), Op); -} - -int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const { - return AMDGPU::getNamedOperandIdx(Opcode, Op); -} - -void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, - int64_t Imm) const { - int Idx = getOperandIdx(*MI, Op); - assert(Idx != -1 && "Operand not supported for this instruction."); - assert(MI->getOperand(Idx).isImm()); - MI->getOperand(Idx).setImm(Imm); -} - -//===----------------------------------------------------------------------===// -// Instruction flag getters/setters -//===----------------------------------------------------------------------===// - -bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { - return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; -} - -MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, - unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; - int FlagIndex = 0; - if (Flag != 0) { - // If we pass something other than the default value of Flag to this - // function, it means we are want to set a flag on an instruction - // that uses native encoding. - assert(HAS_NATIVE_OPERANDS(TargetFlags)); - bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; - switch (Flag) { - case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); - break; - case MO_FLAG_MASK: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); - break; - case MO_FLAG_NOT_LAST: - case MO_FLAG_LAST: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); - break; - case MO_FLAG_NEG: - switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; - case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; - } - break; - - case MO_FLAG_ABS: - assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " - "instructions."); - (void)IsOP3; - switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; - } - break; - - default: - FlagIndex = -1; - break; - } - assert(FlagIndex != -1 && "Flag not supported for this instruction"); - } else { - FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); - assert(FlagIndex != 0 && - "Instruction flags not supported for this instruction"); - } - - MachineOperand &FlagOp = MI->getOperand(FlagIndex); - assert(FlagOp.isImm()); - return FlagOp; -} - -void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, - unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; - if (Flag == 0) { - return; - } - if (HAS_NATIVE_OPERANDS(TargetFlags)) { - MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); - if (Flag == MO_FLAG_NOT_LAST) { - clearFlag(MI, Operand, MO_FLAG_LAST); - } else if (Flag == MO_FLAG_MASK) { - clearFlag(MI, Operand, Flag); - } else { - FlagOp.setImm(1); - } - } else { - MachineOperand &FlagOp = getFlagOp(MI, Operand); - FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); - } -} - -void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, - unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; - if (HAS_NATIVE_OPERANDS(TargetFlags)) { - MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); - FlagOp.setImm(0); - } else { - MachineOperand &FlagOp = getFlagOp(MI); - unsigned InstFlags = FlagOp.getImm(); - InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); - FlagOp.setImm(InstFlags); - } -} diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h deleted 
file mode 100644 index dee4c2b9ae3..00000000000 --- a/lib/Target/R600/R600InstrInfo.h +++ /dev/null @@ -1,303 +0,0 @@ -//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for R600InstrInfo -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H -#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H - -#include "AMDGPUInstrInfo.h" -#include "R600Defines.h" -#include "R600RegisterInfo.h" -#include - -namespace llvm { - - class AMDGPUTargetMachine; - class DFAPacketizer; - class ScheduleDAG; - class MachineFunction; - class MachineInstr; - class MachineInstrBuilder; - - class R600InstrInfo : public AMDGPUInstrInfo { - private: - const R600RegisterInfo RI; - - std::vector > - ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; - - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - public: - enum BankSwizzle { - ALU_VEC_012_SCL_210 = 0, - ALU_VEC_021_SCL_122, - ALU_VEC_120_SCL_212, - ALU_VEC_102_SCL_221, - ALU_VEC_201, - ALU_VEC_210 - }; - - explicit R600InstrInfo(const AMDGPUSubtarget &st); - - const R600RegisterInfo &getRegisterInfo() const override; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const override; - - bool isTrig(const MachineInstr &MI) const; - bool isPlaceHolderOpcode(unsigned opcode) const; - bool isReductionOp(unsigned opcode) const; - bool isCubeOp(unsigned opcode) const; - - /// \returns true if this \p Opcode represents an ALU instruction. - bool isALUInstr(unsigned Opcode) const; - bool hasInstrModifiers(unsigned Opcode) const; - bool isLDSInstr(unsigned Opcode) const; - bool isLDSNoRetInstr(unsigned Opcode) const; - bool isLDSRetInstr(unsigned Opcode) const; - - /// \returns true if this \p Opcode represents an ALU instruction or an - /// instruction that will be lowered in ExpandSpecialInstrs Pass. - bool canBeConsideredALU(const MachineInstr *MI) const; - - bool isTransOnly(unsigned Opcode) const; - bool isTransOnly(const MachineInstr *MI) const; - bool isVectorOnly(unsigned Opcode) const; - bool isVectorOnly(const MachineInstr *MI) const; - bool isExport(unsigned Opcode) const; - - bool usesVertexCache(unsigned Opcode) const; - bool usesVertexCache(const MachineInstr *MI) const; - bool usesTextureCache(unsigned Opcode) const; - bool usesTextureCache(const MachineInstr *MI) const; - - bool mustBeLastInClause(unsigned Opcode) const; - bool usesAddressRegister(MachineInstr *MI) const; - bool definesAddressRegister(MachineInstr *MI) const; - bool readsLDSSrcReg(const MachineInstr *MI) const; - - /// \returns The operand index for the given source number. Legal values - /// for SrcNum are 0, 1, and 2. 
- int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; - /// \returns The operand Index for the Sel operand given an index to one - /// of the instruction's src operands. - int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; - - /// \returns a pair for each src of an ALU instructions. - /// The first member of a pair is the register id. - /// If register is ALU_CONST, second member is SEL. - /// If register is ALU_LITERAL, second member is IMM. - /// Otherwise, second member value is undefined. - SmallVector, 3> - getSrcs(MachineInstr *MI) const; - - unsigned isLegalUpTo( - const std::vector > > &IGSrcs, - const std::vector &Swz, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const; - - bool FindSwizzleForVectorSlot( - const std::vector > > &IGSrcs, - std::vector &SwzCandidate, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const; - - /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210 - /// returns true and the first (in lexical order) BankSwizzle affectation - /// starting from the one already provided in the Instruction Group MIs that - /// fits Read Port limitations in BS if available. Otherwise returns false - /// and undefined content in BS. - /// isLastAluTrans should be set if the last Alu of MIs will be executed on - /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to - /// apply to the last instruction. - /// PV holds GPR to PV registers in the Instruction Group MIs. - bool fitsReadPortLimitations(const std::vector &MIs, - const DenseMap &PV, - std::vector &BS, - bool isLastAluTrans) const; - - /// An instruction group can only access 2 channel pair (either [XY] or [ZW]) - /// from KCache bank on R700+. This function check if MI set in input meet - /// this limitations - bool fitsConstReadLimitations(const std::vector &) const; - /// Same but using const index set instead of MI set. - bool fitsConstReadLimitations(const std::vector&) const; - - /// \brief Vector instructions are instructions that must fill all - /// instruction slots within an instruction group. 
- bool isVector(const MachineInstr &MI) const; - - bool isMov(unsigned Opcode) const override; - - DFAPacketizer * - CreateTargetScheduleState(const TargetSubtargetInfo &) const override; - - bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; - - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, bool AllowModify) const override; - - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, ArrayRef Cond, - DebugLoc DL) const override; - - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - - bool isPredicated(const MachineInstr *MI) const override; - - bool isPredicable(MachineInstr *MI) const override; - - bool - isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; - - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; - - bool - isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; - - bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const override; - - bool SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const override; - - bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const override; - - bool PredicateInstruction(MachineInstr *MI, - ArrayRef Pred) const override; - - unsigned int getPredicationCost(const MachineInstr *) const override; - - unsigned int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost = nullptr) const override; - - int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const override { return 1;} - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - /// \brief Reserve the registers that may be accesed using indirect addressing. - void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; - - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; - - const TargetRegisterClass *getIndirectAddrRegClass() const override; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; - - unsigned getMaxAlusPerClause() const; - - ///buildDefaultInstruction - This function returns a MachineInstr with - /// all the instruction modifiers initialized to their default values. - /// You can use this function to avoid manually specifying each instruction - /// modifier operand when building a new instruction. - /// - /// \returns a MachineInstr with all the instruction modifiers initialized - /// to their default values. 
- MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opcode, - unsigned DstReg, - unsigned Src0Reg, - unsigned Src1Reg = 0) const; - - MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, - MachineInstr *MI, - unsigned Slot, - unsigned DstReg) const; - - MachineInstr *buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const; - - MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; - - /// \brief Get the index of Op in the MachineInstr. - /// - /// \returns -1 if the Instruction does not contain the specified \p Op. - int getOperandIdx(const MachineInstr &MI, unsigned Op) const; - - /// \brief Get the index of \p Op for the given Opcode. - /// - /// \returns -1 if the Instruction does not contain the specified \p Op. - int getOperandIdx(unsigned Opcode, unsigned Op) const; - - /// \brief Helper function for setting instruction flag values. - void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; - - /// \returns true if this instruction has an operand for storing target flags. - bool hasFlagOperand(const MachineInstr &MI) const; - - ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. - void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; - - ///\brief Determine if the specified \p Flag is set on this \p Operand. - bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; - - /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) - /// \param Flag The flag being set. - /// - /// \returns the operand containing the flags for this instruction. - MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, - unsigned Flag = 0) const; - - /// \brief Clear the specified flag on the instruction. - void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; -}; - -namespace AMDGPU { - -int getLDSNoRetOp(uint16_t Opcode); - -} //End namespace AMDGPU - -} // End llvm namespace - -#endif diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td deleted file mode 100644 index 7beed092b3f..00000000000 --- a/lib/Target/R600/R600Instructions.td +++ /dev/null @@ -1,1744 +0,0 @@ -//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are available on R600 family -// GPUs. 
-// -//===----------------------------------------------------------------------===// - -include "R600Intrinsics.td" -include "R600InstrFormats.td" - -class InstR600ISA pattern> : - InstR600 { - - let Namespace = "AMDGPU"; -} - -def MEMxi : Operand { - let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); - let PrintMethod = "printMemOperand"; -} - -def MEMrr : Operand { - let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); -} - -// Operands for non-registers - -class InstFlag - : OperandWithDefaultOps { - let PrintMethod = PM; -} - -// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers -def SEL : OperandWithDefaultOps { - let PrintMethod = "printSel"; -} -def BANK_SWIZZLE : OperandWithDefaultOps { - let PrintMethod = "printBankSwizzle"; -} - -def LITERAL : InstFlag<"printLiteral">; - -def WRITE : InstFlag <"printWrite", 1>; -def OMOD : InstFlag <"printOMOD">; -def REL : InstFlag <"printRel">; -def CLAMP : InstFlag <"printClamp">; -def NEG : InstFlag <"printNeg">; -def ABS : InstFlag <"printAbs">; -def UEM : InstFlag <"printUpdateExecMask">; -def UP : InstFlag <"printUpdatePred">; - -// XXX: The r600g finalizer in Mesa expects last to be one in most cases. -// Once we start using the packetizer in this backend we should have this -// default to 0. -def LAST : InstFlag<"printLast", 1>; -def RSel : Operand { - let PrintMethod = "printRSel"; -} -def CT: Operand { - let PrintMethod = "printCT"; -} - -def FRAMEri : Operand { - let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); -} - -def ADDRParam : ComplexPattern; -def ADDRDWord : ComplexPattern; -def ADDRVTX_READ : ComplexPattern; -def ADDRGA_CONST_OFFSET : ComplexPattern; -def ADDRGA_VAR_OFFSET : ComplexPattern; - - -def R600_Pred : PredicateOperand; - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - -// Class for instructions with only one source register. -// If you add new ins to this instruction, make sure they are listed before -// $literal, because the backend currently assumes that the last operand is -// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in -// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), -// and R600InstrInfo::getOperandIdx(). -class R600_1OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, - "$clamp $last $dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$pred_sel $bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP2 { - - let src1 = 0; - let src1_rel = 0; - let src1_neg = 0; - let src1_abs = 0; - let update_exec_mask = 0; - let update_pred = 0; - let HasNativeOperands = 1; - let Op1 = 1; - let ALUInst = 1; - let DisableEncoding = "$literal"; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_1OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itin = AnyALU> : - R600_1OP ; - -// If you add or change the operands for R600_2OP instructions, you must -// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, -// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). 
-class R600_2OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, - OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, - "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " - "$pred_sel $bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP2 { - - let HasNativeOperands = 1; - let Op2 = 1; - let ALUInst = 1; - let DisableEncoding = "$literal"; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_2OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itin = AnyALU> : - R600_2OP ; - -// If you add our change the operands for R600_3OP instructions, you must -// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, -// R600InstrInfo::buildDefaultInstruction(), and -// R600InstrInfo::getOperandIdx(). -class R600_3OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, - R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " - "$src0_neg$src0$src0_rel, " - "$src1_neg$src1$src1_rel, " - "$src2_neg$src2$src2_rel, " - "$pred_sel" - "$bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP3{ - - let HasNativeOperands = 1; - let DisableEncoding = "$literal"; - let Op3 = 1; - let UseNamedOperandTable = 1; - let ALUInst = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_REDUCTION inst, dag ins, string asm, list pattern, - InstrItinClass itin = VecALU> : - InstR600 <(outs R600_Reg32:$dst), - ins, - asm, - pattern, - itin>; - - - -} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 - -def TEX_SHADOW : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || TType == 13; - }] ->; - -def TEX_RECT : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 5; - }] ->; - -def TEX_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 9 || TType == 10 || TType == 16; - }] ->; - -def TEX_SHADOW_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 11 || TType == 12 || TType == 17; - }] ->; - -def TEX_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 14; - }] ->; - -def TEX_ARRAY_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 15; - }] ->; - -class EG_CF_RAT cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, - dag outs, dag ins, string asm, list pattern> : - InstR600ISA , - CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { - - let rat_id = ratid; - let rat_inst = ratinst; - let rim = 0; - // XXX: Have a separate instruction for non-indexed writes. 
- let type = 1; - let rw_rel = 0; - let elem_size = 0; - - let array_size = 0; - let comp_mask = mask; - let burst_count = 0; - let vpm = 0; - let cf_inst = cfinst; - let mark = 0; - let barrier = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; - -} - -class VTX_READ buffer_id, dag outs, list pattern> - : InstR600ISA , - VTX_WORD1_GPR { - - // Static fields - let DST_REL = 0; - // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, - // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, - // however, based on my testing if USE_CONST_FIELDS is set, then all - // these fields need to be set to 0. - let USE_CONST_FIELDS = 0; - let NUM_FORMAT_ALL = 1; - let FORMAT_COMP_ALL = 0; - let SRF_MODE_ALL = 0; - - let Inst{63-32} = Word1; - // LLVM can only encode 64-bit instructions, so these fields are manually - // encoded in R600CodeEmitter - // - // bits<16> OFFSET; - // bits<2> ENDIAN_SWAP = 0; - // bits<1> CONST_BUF_NO_STRIDE = 0; - // bits<1> MEGA_FETCH = 0; - // bits<1> ALT_CONST = 0; - // bits<2> BUFFER_INDEX_MODE = 0; - - // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding - // is done in R600CodeEmitter - // - // Inst{79-64} = OFFSET; - // Inst{81-80} = ENDIAN_SWAP; - // Inst{82} = CONST_BUF_NO_STRIDE; - // Inst{83} = MEGA_FETCH; - // Inst{84} = ALT_CONST; - // Inst{86-85} = BUFFER_INDEX_MODE; - // Inst{95-86} = 0; Reserved - - // VTX_WORD3 (Padding) - // - // Inst{127-96} = 0; - - let VTXInst = 1; -} - -class LoadParamFrag : PatFrag < - (ops node:$ptr), (load_type node:$ptr), - [{ return isConstantLoad(dyn_cast(N), 0); }] ->; - -def load_param : LoadParamFrag; -def load_param_exti8 : LoadParamFrag; -def load_param_exti16 : LoadParamFrag; - -def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; - -def isR600toCayman - : Predicate< - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; - -//===----------------------------------------------------------------------===// -// R600 SDNodes -//===----------------------------------------------------------------------===// - -def INTERP_PAIR_XY : AMDGPUShaderInst < - (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), - (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), - "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", - []>; - -def INTERP_PAIR_ZW : AMDGPUShaderInst < - (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), - (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), - "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", - []>; - -def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", - SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, - [SDNPVariadic] ->; - -def DOT4 : SDNode<"AMDGPUISD::DOT4", - SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, - SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, - SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, - [] ->; - -def COS_HW : SDNode<"AMDGPUISD::COS_HW", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> ->; - -def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> ->; - -def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; - -def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; - -multiclass TexPattern TextureOp, Instruction inst, ValueType vt = v4f32> { -def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, - (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), - (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), - (i32 
imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), - (i32 imm:$DST_SEL_W), - (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), - (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), - (i32 imm:$COORD_TYPE_W)), - (inst R600_Reg128:$SRC_GPR, - imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, - imm:$offsetx, imm:$offsety, imm:$offsetz, - imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, - imm:$DST_SEL_W, - imm:$RESOURCE_ID, imm:$SAMPLER_ID, - imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, - imm:$COORD_TYPE_W)>; -} - -//===----------------------------------------------------------------------===// -// Interpolation Instructions -//===----------------------------------------------------------------------===// - -def INTERP_VEC_LOAD : AMDGPUShaderInst < - (outs R600_Reg128:$dst), - (ins i32imm:$src0), - "INTERP_LOAD $src0 : $dst", - [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; - -def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { - let bank_swizzle = 5; -} - -def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { - let bank_swizzle = 5; -} - -def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; - -//===----------------------------------------------------------------------===// -// Export Instructions -//===----------------------------------------------------------------------===// - -def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; - -def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, - [SDNPHasChain, SDNPSideEffect]>; - -class ExportWord0 { - field bits<32> Word0; - - bits<13> arraybase; - bits<2> type; - bits<7> gpr; - bits<2> elem_size; - - let Word0{12-0} = arraybase; - let Word0{14-13} = type; - let Word0{21-15} = gpr; - let Word0{22} = 0; // RW_REL - let Word0{29-23} = 0; // INDEX_GPR - let Word0{31-30} = elem_size; -} - -class ExportSwzWord1 { - field bits<32> Word1; - - bits<3> sw_x; - bits<3> sw_y; - bits<3> sw_z; - bits<3> sw_w; - bits<1> eop; - bits<8> inst; - - let Word1{2-0} = sw_x; - let Word1{5-3} = sw_y; - let Word1{8-6} = sw_z; - let Word1{11-9} = sw_w; -} - -class ExportBufWord1 { - field bits<32> Word1; - - bits<12> arraySize; - bits<4> compMask; - bits<1> eop; - bits<8> inst; - - let Word1{11-0} = arraySize; - let Word1{15-12} = compMask; -} - -multiclass ExportPattern cf_inst> { - def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 0, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 7, 0, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy (i32 imm:$type)), - (ExportInst - (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy 1), - (ExportInst - (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), - (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), - (ExportInst R600_Reg128:$src, imm:$type, imm:$base, - imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) - >; - -} - -multiclass SteamOutputExportPattern buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { -// Stream0 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), - (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, - 4095, imm:$mask, buf0inst, 0)>; -// Stream1 - def : Pat<(int_R600_store_stream_output (v4f32 
R600_Reg128:$src), - (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), - (ExportInst $src, 0, imm:$arraybase, - 4095, imm:$mask, buf1inst, 0)>; -// Stream2 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), - (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), - (ExportInst $src, 0, imm:$arraybase, - 4095, imm:$mask, buf2inst, 0)>; -// Stream3 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), - (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), - (ExportInst $src, 0, imm:$arraybase, - 4095, imm:$mask, buf3inst, 0)>; -} - -// Export Instructions should not be duplicated by TailDuplication pass -// (which assumes that duplicable instruction are affected by exec mask) -let usesCustomInserter = 1, isNotDuplicable = 1 in { - -class ExportSwzInst : InstR600ISA<( - outs), - (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, - RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst, - i32imm:$eop), - !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"), - []>, ExportWord0, ExportSwzWord1 { - let elem_size = 3; - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; -} - -} // End usesCustomInserter = 1 - -class ExportBufInst : InstR600ISA<( - outs), - (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, - i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), - !strconcat("EXPORT", " $gpr"), - []>, ExportWord0, ExportBufWord1 { - let elem_size = 0; - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; -} - -//===----------------------------------------------------------------------===// -// Control Flow Instructions -//===----------------------------------------------------------------------===// - - -def KCACHE : InstFlag<"printKCache">; - -class ALU_CLAUSE inst, string OpName> : AMDGPUInst <(outs), -(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, -KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, -i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, -i32imm:$COUNT, i32imm:$Enabled), -!strconcat(OpName, " $COUNT, @$ADDR, " -"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), -[] >, CF_ALU_WORD0, CF_ALU_WORD1 { - field bits<64> Inst; - - let CF_INST = inst; - let ALT_CONST = 0; - let WHOLE_QUAD_MODE = 0; - let BARRIER = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class CF_WORD0_R600 { - field bits<32> Word0; - - bits<32> ADDR; - - let Word0 = ADDR; -} - -class CF_CLAUSE_R600 inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), -ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { - field bits<64> Inst; - bits<4> CNT; - - let CF_INST = inst; - let BARRIER = 1; - let CF_CONST = 0; - let VALID_PIXEL_MODE = 0; - let COND = 0; - let COUNT = CNT{2-0}; - let CALL_COUNT = 0; - let COUNT_3 = CNT{3}; - let END_OF_PROGRAM = 0; - let WHOLE_QUAD_MODE = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class CF_CLAUSE_EG inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), -ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { - field bits<64> Inst; - - let CF_INST = inst; - let BARRIER = 1; - let JUMPTABLE_SEL = 0; - let CF_CONST = 0; - let VALID_PIXEL_MODE = 0; - let COND = 0; - let END_OF_PROGRAM = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -def CF_ALU : ALU_CLAUSE<8, "ALU">; -def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; -def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; -def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; -def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; -def CF_ALU_ELSE_AFTER : 
ALU_CLAUSE<15, "ALU_ELSE_AFTER">; - -def FETCH_CLAUSE : AMDGPUInst <(outs), -(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { - field bits<8> Inst; - bits<8> num; - let Inst = num; - let isCodeGenOnly = 1; -} - -def ALU_CLAUSE : AMDGPUInst <(outs), -(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { - field bits<8> Inst; - bits<8> num; - let Inst = num; - let isCodeGenOnly = 1; -} - -def LITERALS : AMDGPUInst <(outs), -(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { - let isCodeGenOnly = 1; - - field bits<64> Inst; - bits<32> literal1; - bits<32> literal2; - - let Inst{31-0} = literal1; - let Inst{63-32} = literal2; -} - -def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { - field bits<64> Inst; -} - -let Predicates = [isR600toCayman] in { - -//===----------------------------------------------------------------------===// -// Common Instructions R600, R700, Evergreen, Cayman -//===----------------------------------------------------------------------===// - -def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; -// Non-IEEE MUL: 0 * anything = 0 -def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; -def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; -// TODO: Do these actually match the regular fmin/fmax behavior? -def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; -def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; -// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx -// DX10 min/max returns the other operand if one is NaN, -// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic -def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; -def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; - -// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, -// so some of the instruction names don't match the asm string. -// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
-def SETE : R600_2OP < - 0x08, "SETE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] ->; - -def SGT : R600_2OP < - 0x09, "SETGT", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] ->; - -def SGE : R600_2OP < - 0xA, "SETGE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] ->; - -def SNE : R600_2OP < - 0xB, "SETNE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] ->; - -def SETE_DX10 : R600_2OP < - 0xC, "SETE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))] ->; - -def SETGT_DX10 : R600_2OP < - 0xD, "SETGT_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))] ->; - -def SETGE_DX10 : R600_2OP < - 0xE, "SETGE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] ->; - -// FIXME: This should probably be COND_ONE -def SETNE_DX10 : R600_2OP < - 0xF, "SETNE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] ->; - -def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; -def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; -def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; -def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; -def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; - -def MOV : R600_1OP <0x19, "MOV", []>; - -let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { - -class MOV_IMM : AMDGPUInst < - (outs R600_Reg32:$dst), - (ins immType:$imm), - "", - [] ->; - -} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 - -def MOV_IMM_I32 : MOV_IMM; -def : Pat < - (imm:$val), - (MOV_IMM_I32 imm:$val) ->; - -def MOV_IMM_F32 : MOV_IMM; -def : Pat < - (fpimm:$val), - (MOV_IMM_F32 fpimm:$val) ->; - -def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; -def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; -def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; -def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>; - -let hasSideEffects = 1 in { - -def KILLGT : R600_2OP <0x2D, "KILLGT", []>; - -} // end hasSideEffects - -def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; -def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; -def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; -def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; -def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; -def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; -def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; -def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; -def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; -def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; - -def SETE_INT : R600_2OP < - 0x3A, "SETE_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] ->; - -def SETGT_INT : R600_2OP < - 0x3B, "SETGT_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] ->; - -def SETGE_INT : R600_2OP < - 0x3C, "SETGE_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] ->; - -def SETNE_INT : R600_2OP < - 0x3D, "SETNE_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] ->; - -def SETGT_UINT : R600_2OP < - 0x3E, "SETGT_UINT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] ->; - -def SETGE_UINT : R600_2OP < - 0x3F, "SETGE_UINT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] ->; - -def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; -def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>; -def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; 
-def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; - -def CNDE_INT : R600_3OP < - 0x1C, "CNDE_INT", - [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] ->; - -def CNDGE_INT : R600_3OP < - 0x1E, "CNDGE_INT", - [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))] ->; - -def CNDGT_INT : R600_3OP < - 0x1D, "CNDGT_INT", - [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))] ->; - -//===----------------------------------------------------------------------===// -// Texture instructions -//===----------------------------------------------------------------------===// - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - -class R600_TEX inst, string opName> : - InstR600 <(outs R600_Reg128:$DST_GPR), - (ins R600_Reg128:$SRC_GPR, - RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw, - i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz, - RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W, - i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, - CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, - CT:$COORD_TYPE_W), - !strconcat(opName, - " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " - "$SRC_GPR.$srcx$srcy$srcz$srcw " - "RID:$RESOURCE_ID SID:$SAMPLER_ID " - "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"), - [], - NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let TEX_INST = inst{4-0}; - let SRC_REL = 0; - let DST_REL = 0; - let LOD_BIAS = 0; - - let INST_MOD = 0; - let FETCH_WHOLE_QUAD = 0; - let ALT_CONST = 0; - let SAMPLER_INDEX_MODE = 0; - let RESOURCE_INDEX_MODE = 0; - - let TEXInst = 1; -} - -} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 - - - -def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">; -def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">; -def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">; -def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">; -def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">; -def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">; -def TEX_LD : R600_TEX <0x03, "TEX_LD">; -def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> { - let INST_MOD = 1; -} -def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">; -def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">; -def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">; -def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">; -def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">; -def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">; -def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">; - -defm : TexPattern<0, TEX_SAMPLE>; -defm : TexPattern<1, TEX_SAMPLE_C>; -defm : TexPattern<2, TEX_SAMPLE_L>; -defm : TexPattern<3, TEX_SAMPLE_C_L>; -defm : TexPattern<4, TEX_SAMPLE_LB>; -defm : TexPattern<5, TEX_SAMPLE_C_LB>; -defm : TexPattern<6, TEX_LD, v4i32>; -defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>; -defm : TexPattern<8, TEX_GET_GRADIENTS_H>; -defm : TexPattern<9, TEX_GET_GRADIENTS_V>; -defm : TexPattern<10, TEX_LDPTR, v4i32>; - -//===----------------------------------------------------------------------===// -// Helper classes for common instructions -//===----------------------------------------------------------------------===// - -class MUL_LIT_Common inst> : R600_3OP < - inst, "MUL_LIT", - [] ->; - -class MULADD_Common inst> : R600_3OP < - inst, "MULADD", - [] ->; - -class MULADD_IEEE_Common inst> : R600_3OP < - inst, "MULADD_IEEE", - [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] ->; 
- -class FMA_Common inst> : R600_3OP < - inst, "FMA", - [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU ->; - -class CNDE_Common inst> : R600_3OP < - inst, "CNDE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] ->; - -class CNDGT_Common inst> : R600_3OP < - inst, "CNDGT", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] -> { - let Itinerary = VecALU; -} - -class CNDGE_Common inst> : R600_3OP < - inst, "CNDGE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] -> { - let Itinerary = VecALU; -} - - -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { -class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins -// Slot X - UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, - OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, - R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, - R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, - R600_Pred:$pred_sel_X, -// Slot Y - UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, - OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, - R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, - R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, - R600_Pred:$pred_sel_Y, -// Slot Z - UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, - OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, - R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, - R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, - R600_Pred:$pred_sel_Z, -// Slot W - UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, - OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, - R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, - R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, - R600_Pred:$pred_sel_W, - LITERAL:$literal0, LITERAL:$literal1), - "", - pattern, - AnyALU> { - - let UseNamedOperandTable = 1; - -} -} - -def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 - R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, - R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, - R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, - R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; - - -class DOT4_Common inst> : R600_2OP ; - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { -multiclass CUBE_Common inst> { - - def _pseudo : InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0), - "CUBE $dst $src0", - [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], - VecALU - > { - let isPseudo = 1; - let UseNamedOperandTable = 1; - } - - def _real : R600_2OP ; -} -} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 - -class EXP_IEEE_Common inst> : R600_1OP_Helper < - inst, "EXP_IEEE", fexp2 -> { - let Itinerary = TransALU; -} - -class FLT_TO_INT_Common inst> : R600_1OP_Helper < - inst, "FLT_TO_INT", fp_to_sint -> { - let Itinerary = TransALU; -} - -class INT_TO_FLT_Common inst> : R600_1OP_Helper < - inst, "INT_TO_FLT", sint_to_fp -> { - let Itinerary = TransALU; -} - -class FLT_TO_UINT_Common inst> : R600_1OP_Helper < - inst, "FLT_TO_UINT", fp_to_uint -> { - let Itinerary = TransALU; -} - -class UINT_TO_FLT_Common inst> : R600_1OP_Helper < - inst, "UINT_TO_FLT", uint_to_fp -> { - let Itinerary = TransALU; -} - -class LOG_CLAMPED_Common inst> : R600_1OP < - inst, 
"LOG_CLAMPED", [] ->; - -class LOG_IEEE_Common inst> : R600_1OP_Helper < - inst, "LOG_IEEE", flog2 -> { - let Itinerary = TransALU; -} - -class LSHL_Common inst> : R600_2OP_Helper ; -class LSHR_Common inst> : R600_2OP_Helper ; -class ASHR_Common inst> : R600_2OP_Helper ; -class MULHI_INT_Common inst> : R600_2OP_Helper < - inst, "MULHI_INT", mulhs -> { - let Itinerary = TransALU; -} -class MULHI_UINT_Common inst> : R600_2OP_Helper < - inst, "MULHI", mulhu -> { - let Itinerary = TransALU; -} -class MULLO_INT_Common inst> : R600_2OP_Helper < - inst, "MULLO_INT", mul -> { - let Itinerary = TransALU; -} -class MULLO_UINT_Common inst> : R600_2OP { - let Itinerary = TransALU; -} - -class RECIP_CLAMPED_Common inst> : R600_1OP < - inst, "RECIP_CLAMPED", [] -> { - let Itinerary = TransALU; -} - -class RECIP_IEEE_Common inst> : R600_1OP < - inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] -> { - let Itinerary = TransALU; -} - -class RECIP_UINT_Common inst> : R600_1OP_Helper < - inst, "RECIP_UINT", AMDGPUurecip -> { - let Itinerary = TransALU; -} - -// Clamped to maximum. -class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped -> { - let Itinerary = TransALU; -} - -class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy -> { - let Itinerary = TransALU; -} - -// TODO: There is also RECIPSQRT_FF which clamps to zero. - -class SIN_Common inst> : R600_1OP < - inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ - let Trig = 1; - let Itinerary = TransALU; -} - -class COS_Common inst> : R600_1OP < - inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> { - let Trig = 1; - let Itinerary = TransALU; -} - -def CLAMP_R600 : CLAMP ; -def FABS_R600 : FABS; -def FNEG_R600 : FNEG; - -//===----------------------------------------------------------------------===// -// Helper patterns for complex intrinsics -//===----------------------------------------------------------------------===// - -// FIXME: Should be predicated on unsafe fp math. 
-multiclass DIV_Common { -def : Pat< - (int_AMDGPU_div f32:$src0, f32:$src1), - (MUL_IEEE $src0, (recip_ieee $src1)) ->; - -def : Pat< - (fdiv f32:$src0, f32:$src1), - (MUL_IEEE $src0, (recip_ieee $src1)) ->; - -def : RcpPat; -} - -class TGSI_LIT_Z_Common - : Pat < - (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), - (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) ->; - -//===----------------------------------------------------------------------===// -// R600 / R700 Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isR600] in { - - def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; - def MULADD_r600 : MULADD_Common<0x10>; - def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; - def CNDE_r600 : CNDE_Common<0x18>; - def CNDGT_r600 : CNDGT_Common<0x19>; - def CNDGE_r600 : CNDGE_Common<0x1A>; - def DOT4_r600 : DOT4_Common<0x50>; - defm CUBE_r600 : CUBE_Common<0x52>; - def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; - def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; - def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; - def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; - def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; - def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; - def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; - def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; - def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; - def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; - def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; - def SIN_r600 : SIN_Common<0x6E>; - def COS_r600 : COS_Common<0x6F>; - def ASHR_r600 : ASHR_Common<0x70>; - def LSHR_r600 : LSHR_Common<0x71>; - def LSHL_r600 : LSHL_Common<0x72>; - def MULLO_INT_r600 : MULLO_INT_Common<0x73>; - def MULHI_INT_r600 : MULHI_INT_Common<0x74>; - def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; - def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; - def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; - - defm DIV_r600 : DIV_Common; - def : POW_Common ; - def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; - - def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; - def : RsqPat; - - def R600_ExportSwz : ExportSwzInst { - let Word1{20-17} = 0; // BURST_COUNT - let Word1{21} = eop; - let Word1{22} = 0; // VALID_PIXEL_MODE - let Word1{30-23} = inst; - let Word1{31} = 1; // BARRIER - } - defm : ExportPattern; - - def R600_ExportBuf : ExportBufInst { - let Word1{20-17} = 0; // BURST_COUNT - let Word1{21} = eop; - let Word1{22} = 0; // VALID_PIXEL_MODE - let Word1{30-23} = inst; - let Word1{31} = 1; // BARRIER - } - defm : SteamOutputExportPattern; - - def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT), - "TEX $CNT @$ADDR"> { - let POP_COUNT = 0; - } - def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT), - "VTX $CNT @$ADDR"> { - let POP_COUNT = 0; - } - def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), - "LOOP_START_DX10 @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), - "LOOP_BREAK @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), - "CONTINUE @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "JUMP @$ADDR POP:$POP_COUNT"> { - let CNT = 0; - } - def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), - 
"PUSH_ELSE @$ADDR"> { - let CNT = 0; - let POP_COUNT = 0; // FIXME? - } - def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "ELSE @$ADDR POP:$POP_COUNT"> { - let CNT = 0; - } - def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { - let ADDR = 0; - let CNT = 0; - let POP_COUNT = 0; - } - def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "POP @$ADDR POP:$POP_COUNT"> { - let CNT = 0; - } - def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { - let CNT = 0; - let POP_COUNT = 0; - let ADDR = 0; - let END_OF_PROGRAM = 1; - } - -} - - -//===----------------------------------------------------------------------===// -// Regist loads and stores - for indirect addressing -//===----------------------------------------------------------------------===// - -defm R600_ : RegisterLoadStore ; - - -//===----------------------------------------------------------------------===// -// Pseudo instructions -//===----------------------------------------------------------------------===// - -let isPseudo = 1 in { - -def PRED_X : InstR600 < - (outs R600_Predicate_Bit:$dst), - (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), - "", [], NullALU> { - let FlagOperandIdx = 3; -} - -let isTerminator = 1, isBranch = 1 in { -def JUMP_COND : InstR600 < - (outs), - (ins brtarget:$target, R600_Predicate_Bit:$p), - "JUMP $target ($p)", - [], AnyALU - >; - -def JUMP : InstR600 < - (outs), - (ins brtarget:$target), - "JUMP $target", - [], AnyALU - > -{ - let isPredicable = 1; - let isBarrier = 1; -} - -} // End isTerminator = 1, isBranch = 1 - -let usesCustomInserter = 1 in { - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { - -def MASK_WRITE : AMDGPUShaderInst < - (outs), - (ins R600_Reg32:$src), - "MASK_WRITE $src", - [] ->; - -} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 - - -def TXD: InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, - i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, imm:$textureTarget))], - NullALU > { - let TEXInst = 1; -} - -def TXD_SHADOW: InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, - i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], - NullALU -> { - let TEXInst = 1; -} -} // End isPseudo = 1 -} // End usesCustomInserter = 1 - - -//===----------------------------------------------------------------------===// -// Constant Buffer Addressing Support -//===----------------------------------------------------------------------===// - -let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { -def CONST_COPY : Instruction { - let OutOperandList = (outs R600_Reg32:$dst); - let InOperandList = (ins i32imm:$src); - let Pattern = - [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; - let AsmString = "CONST_COPY"; - let hasSideEffects = 0; - let isAsCheapAsAMove = 1; - let Itinerary = NullALU; -} -} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" - -def TEX_VTX_CONSTBUF : - InstR600ISA <(outs R600_Reg128:$dst), 
(ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", - [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, - VTX_WORD1_GPR, VTX_WORD0_eg { - - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let SRC_REL = 0; - let SRC_SEL_X = 0; - let DST_REL = 0; - let USE_CONST_FIELDS = 0; - let NUM_FORMAT_ALL = 2; - let FORMAT_COMP_ALL = 1; - let SRF_MODE_ALL = 1; - let MEGA_FETCH_COUNT = 16; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 35; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - -// LLVM can only encode 64-bit instructions, so these fields are manually -// encoded in R600CodeEmitter -// -// bits<16> OFFSET; -// bits<2> ENDIAN_SWAP = 0; -// bits<1> CONST_BUF_NO_STRIDE = 0; -// bits<1> MEGA_FETCH = 0; -// bits<1> ALT_CONST = 0; -// bits<2> BUFFER_INDEX_MODE = 0; - - - -// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding -// is done in R600CodeEmitter -// -// Inst{79-64} = OFFSET; -// Inst{81-80} = ENDIAN_SWAP; -// Inst{82} = CONST_BUF_NO_STRIDE; -// Inst{83} = MEGA_FETCH; -// Inst{84} = ALT_CONST; -// Inst{86-85} = BUFFER_INDEX_MODE; -// Inst{95-86} = 0; Reserved - -// VTX_WORD3 (Padding) -// -// Inst{127-96} = 0; - let VTXInst = 1; -} - -def TEX_VTX_TEXBUF: - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", - [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, -VTX_WORD1_GPR, VTX_WORD0_eg { - -let VC_INST = 0; -let FETCH_TYPE = 2; -let FETCH_WHOLE_QUAD = 0; -let SRC_REL = 0; -let SRC_SEL_X = 0; -let DST_REL = 0; -let USE_CONST_FIELDS = 1; -let NUM_FORMAT_ALL = 0; -let FORMAT_COMP_ALL = 0; -let SRF_MODE_ALL = 1; -let MEGA_FETCH_COUNT = 16; -let DST_SEL_X = 0; -let DST_SEL_Y = 1; -let DST_SEL_Z = 2; -let DST_SEL_W = 3; -let DATA_FORMAT = 0; - -let Inst{31-0} = Word0; -let Inst{63-32} = Word1; - -// LLVM can only encode 64-bit instructions, so these fields are manually -// encoded in R600CodeEmitter -// -// bits<16> OFFSET; -// bits<2> ENDIAN_SWAP = 0; -// bits<1> CONST_BUF_NO_STRIDE = 0; -// bits<1> MEGA_FETCH = 0; -// bits<1> ALT_CONST = 0; -// bits<2> BUFFER_INDEX_MODE = 0; - - - -// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding -// is done in R600CodeEmitter -// -// Inst{79-64} = OFFSET; -// Inst{81-80} = ENDIAN_SWAP; -// Inst{82} = CONST_BUF_NO_STRIDE; -// Inst{83} = MEGA_FETCH; -// Inst{84} = ALT_CONST; -// Inst{86-85} = BUFFER_INDEX_MODE; -// Inst{95-86} = 0; Reserved - -// VTX_WORD3 (Padding) -// -// Inst{127-96} = 0; - let VTXInst = 1; -} - -//===---------------------------------------------------------------------===// -// Flow and Program control Instructions -//===---------------------------------------------------------------------===// -class ILFormat pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 1; -} - -multiclass BranchConditional { - def _i32 : ILFormat<(outs), - (ins brtarget:$target, rci:$src0), - "; i32 Pseudo branch instruction", - [(Op bb:$target, (i32 rci:$src0))]>; - def _f32 : ILFormat<(outs), - (ins brtarget:$target, rcf:$src0), - "; f32 Pseudo branch instruction", - [(Op bb:$target, (f32 
rcf:$src0))]>; -} - -// Only scalar types should generate flow control -multiclass BranchInstr { - def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), - !strconcat(name, " $src"), []>; - def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), - !strconcat(name, " $src"), []>; -} -// Only scalar types should generate flow control -multiclass BranchInstr2 { - def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), - !strconcat(name, " $src0, $src1"), []>; - def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), - !strconcat(name, " $src0, $src1"), []>; -} - -//===---------------------------------------------------------------------===// -// Custom Inserter for Branches and returns, this eventually will be a -// separate pass -//===---------------------------------------------------------------------===// -let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { - def BRANCH : ILFormat<(outs), (ins brtarget:$target), - "; Pseudo unconditional branch instruction", - [(br bb:$target)]>; - defm BRANCH_COND : BranchConditional; -} - -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; -} - -//===----------------------------------------------------------------------===// -// Branch Instructions -//===----------------------------------------------------------------------===// - -def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), - "IF_PREDICATE_SET $src", []>; - -let isTerminator=1 in { - def BREAK : ILFormat< (outs), (ins), - "BREAK", []>; - def CONTINUE : ILFormat< (outs), (ins), - "CONTINUE", []>; - def DEFAULT : ILFormat< (outs), (ins), - "DEFAULT", []>; - def ELSE : ILFormat< (outs), (ins), - "ELSE", []>; - def ENDSWITCH : ILFormat< (outs), (ins), - "ENDSWITCH", []>; - def ENDMAIN : ILFormat< (outs), (ins), - "ENDMAIN", []>; - def END : ILFormat< (outs), (ins), - "END", []>; - def ENDFUNC : ILFormat< (outs), (ins), - "ENDFUNC", []>; - def ENDIF : ILFormat< (outs), (ins), - "ENDIF", []>; - def WHILELOOP : ILFormat< (outs), (ins), - "WHILE", []>; - def ENDLOOP : ILFormat< (outs), (ins), - "ENDLOOP", []>; - def FUNC : ILFormat< (outs), (ins), - "FUNC", []>; - def RETDYN : ILFormat< (outs), (ins), - "RET_DYN", []>; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; - defm IFC : BranchInstr2<"IFC">; - defm BREAKC : BranchInstr2<"BREAKC">; - defm CONTINUEC : BranchInstr2<"CONTINUEC">; -} - -//===----------------------------------------------------------------------===// -// Indirect addressing pseudo instructions 
-//===----------------------------------------------------------------------===// - -let isPseudo = 1 in { - -class ExtractVertical : InstR600 < - (outs R600_Reg32:$dst), - (ins vec_rc:$vec, R600_Reg32:$index), "", - [], - AnyALU ->; - -let Constraints = "$dst = $vec" in { - -class InsertVertical : InstR600 < - (outs vec_rc:$dst), - (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", - [], - AnyALU ->; - -} // End Constraints = "$dst = $vec" - -} // End isPseudo = 1 - -def R600_EXTRACT_ELT_V2 : ExtractVertical ; -def R600_EXTRACT_ELT_V4 : ExtractVertical ; - -def R600_INSERT_ELT_V2 : InsertVertical ; -def R600_INSERT_ELT_V4 : InsertVertical ; - -class ExtractVerticalPat : Pat < - (scalar_ty (extractelt vec_ty:$vec, i32:$index)), - (inst $vec, $index) ->; - -def : ExtractVerticalPat ; -def : ExtractVerticalPat ; -def : ExtractVerticalPat ; -def : ExtractVerticalPat ; - -class InsertVerticalPat : Pat < - (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), - (inst $vec, $value, $index) ->; - -def : InsertVerticalPat ; -def : InsertVerticalPat ; -def : InsertVerticalPat ; -def : InsertVerticalPat ; - -//===----------------------------------------------------------------------===// -// ISel Patterns -//===----------------------------------------------------------------------===// - -// CND*_INT Pattterns for f32 True / False values - -class CND_INT_f32 : Pat < - (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), - (cnd $src0, $src1, $src2) ->; - -def : CND_INT_f32 ; -def : CND_INT_f32 ; -def : CND_INT_f32 ; - -//CNDGE_INT extra pattern -def : Pat < - (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), - (CNDGE_INT $src0, $src1, $src2) ->; - -// KIL Patterns -def KILP : Pat < - (int_AMDGPU_kilp), - (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) ->; - -def KIL : Pat < - (int_AMDGPU_kill f32:$src0), - (MASK_WRITE (KILLGT (f32 ZERO), $src0)) ->; - -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; - -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; - -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; - -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; - -// bitconvert patterns - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -// DWORDADDR pattern -def : DwordAddrPat ; - -} // End isR600toCayman Predicate - -let Predicates = [isR600] in { -// Intrinsic patterns -defm : Expand24IBitOps; -defm : Expand24UBitOps; -} // End isR600 - -def getLDSNoRetOp : InstrMapping { - let FilterClass = "R600_LDS_1A1D"; - let RowFields = ["BaseOp"]; - let ColFields = ["DisableEncoding"]; - let KeyCol = ["$dst"]; - let ValueCols = [[""""]]; -} diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td deleted file mode 100644 index 9681747006d..00000000000 --- a/lib/Target/R600/R600Intrinsics.td +++ /dev/null @@ -1,75 +0,0 @@ -//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// R600 Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "R600", isTarget = 1 in { - class TextureIntrinsicFloatInput : - Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - class TextureIntrinsicInt32Input : - Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - - def int_R600_load_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_const : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_R600_interp_xy : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; -def int_R600_interp_zw : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_R600_load_texbuf : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_tex : TextureIntrinsicFloatInput; - def int_R600_texc : TextureIntrinsicFloatInput; - def int_R600_txl : TextureIntrinsicFloatInput; - def int_R600_txlc : TextureIntrinsicFloatInput; - def int_R600_txb : TextureIntrinsicFloatInput; - def int_R600_txbc : TextureIntrinsicFloatInput; - def int_R600_txf : TextureIntrinsicInt32Input; - def int_R600_ldptr : TextureIntrinsicInt32Input; - def int_R600_txq : TextureIntrinsicInt32Input; - def int_R600_ddx : TextureIntrinsicFloatInput; - def int_R600_ddy : TextureIntrinsicFloatInput; - def int_R600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_stream_output : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_pixel_depth : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_pixel_stencil : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_dummy : - Intrinsic<[], [llvm_i32_ty], []>; -} diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp deleted file mode 100644 index 01105c614c5..00000000000 --- a/lib/Target/R600/R600MachineFunctionInfo.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "R600MachineFunctionInfo.h" - -using namespace llvm; - - -// Pin the vtable to this file. 
-void R600MachineFunctionInfo::anchor() {} - -R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) - : AMDGPUMachineFunction(MF) { } diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h deleted file mode 100644 index 263561edd30..00000000000 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ /dev/null @@ -1,34 +0,0 @@ -//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H - -#include "AMDGPUMachineFunction.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include - -namespace llvm { - -class R600MachineFunctionInfo : public AMDGPUMachineFunction { - void anchor() override; -public: - R600MachineFunctionInfo(const MachineFunction &MF); - SmallVector LiveOuts; - std::vector IndirectRegs; - unsigned StackSize; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp deleted file mode 100644 index bcde5fb50da..00000000000 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ /dev/null @@ -1,469 +0,0 @@ -//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Machine Scheduler interface -// -//===----------------------------------------------------------------------===// - -#include "R600MachineScheduler.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Pass.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "misched" - -void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { - assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); - DAG = static_cast(dag); - const AMDGPUSubtarget &ST = DAG->MF.getSubtarget(); - TII = static_cast(DAG->TII); - TRI = static_cast(DAG->TRI); - VLIW5 = !ST.hasCaymanISA(); - MRI = &DAG->MRI; - CurInstKind = IDOther; - CurEmitted = 0; - OccupedSlotsMask = 31; - InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); - InstKindLimit[IDOther] = 32; - InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); - AluInstCount = 0; - FetchInstCount = 0; -} - -void R600SchedStrategy::MoveUnits(std::vector &QSrc, - std::vector &QDst) -{ - QDst.insert(QDst.end(), QSrc.begin(), QSrc.end()); - QSrc.clear(); -} - -static -unsigned getWFCountLimitedByGPR(unsigned GPRCount) { - assert (GPRCount && "GPRCount cannot be 0"); - return 248 / GPRCount; -} - -SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { - SUnit *SU = nullptr; - NextInstKind = IDOther; - - IsTopNode = false; - - // check if we might want to switch current clause type - bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || - (Available[CurInstKind].empty()); - bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && - (!Available[IDFetch].empty() || !Available[IDOther].empty()); - - if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { - // We use the heuristic provided by AMD Accelerated Parallel Processing - // OpenCL Programming Guide : - // The approx. number of WF that allows TEX inst to hide ALU inst is : - // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) - float ALUFetchRationEstimate = - (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / - (FetchInstCount + Available[IDFetch].size()); - if (ALUFetchRationEstimate == 0) { - AllowSwitchFromAlu = true; - } else { - unsigned NeededWF = 62.5f / ALUFetchRationEstimate; - DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); - // We assume the local GPR requirements to be "dominated" by the requirement - // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and - // after TEX are indeed likely to consume or generate values from/for the - // TEX clause. - // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause - // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need - // one GPR) or TmXYZW = TnXYZW (need 2 GPR). - // (TODO : use RegisterPressure) - // If we are going too use too many GPR, we flush Fetch instruction to lower - // register pressure on 128 bits regs. 
- unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); - if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) - AllowSwitchFromAlu = true; - } - } - - if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || - (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { - // try to pick ALU - SU = pickAlu(); - if (!SU && !PhysicalRegCopy.empty()) { - SU = PhysicalRegCopy.front(); - PhysicalRegCopy.erase(PhysicalRegCopy.begin()); - } - if (SU) { - if (CurEmitted >= InstKindLimit[IDAlu]) - CurEmitted = 0; - NextInstKind = IDAlu; - } - } - - if (!SU) { - // try to pick FETCH - SU = pickOther(IDFetch); - if (SU) - NextInstKind = IDFetch; - } - - // try to pick other - if (!SU) { - SU = pickOther(IDOther); - if (SU) - NextInstKind = IDOther; - } - - DEBUG( - if (SU) { - dbgs() << " ** Pick node **\n"; - SU->dump(DAG); - } else { - dbgs() << "NO NODE \n"; - for (unsigned i = 0; i < DAG->SUnits.size(); i++) { - const SUnit &S = DAG->SUnits[i]; - if (!S.isScheduled) - S.dump(DAG); - } - } - ); - - return SU; -} - -void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { - if (NextInstKind != CurInstKind) { - DEBUG(dbgs() << "Instruction Type Switch\n"); - if (NextInstKind != IDAlu) - OccupedSlotsMask |= 31; - CurEmitted = 0; - CurInstKind = NextInstKind; - } - - if (CurInstKind == IDAlu) { - AluInstCount ++; - switch (getAluKind(SU)) { - case AluT_XYZW: - CurEmitted += 4; - break; - case AluDiscarded: - break; - default: { - ++CurEmitted; - for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), - E = SU->getInstr()->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) - ++CurEmitted; - } - } - } - } else { - ++CurEmitted; - } - - - DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); - - if (CurInstKind != IDFetch) { - MoveUnits(Pending[IDFetch], Available[IDFetch]); - } else - FetchInstCount++; -} - -static bool -isPhysicalRegCopy(MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::COPY) - return false; - - return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); -} - -void R600SchedStrategy::releaseTopNode(SUnit *SU) { - DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); -} - -void R600SchedStrategy::releaseBottomNode(SUnit *SU) { - DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); - if (isPhysicalRegCopy(SU->getInstr())) { - PhysicalRegCopy.push_back(SU); - return; - } - - int IK = getInstKind(SU); - - // There is no export clause, we can schedule one as soon as its ready - if (IK == IDOther) - Available[IDOther].push_back(SU); - else - Pending[IK].push_back(SU); - -} - -bool R600SchedStrategy::regBelongsToClass(unsigned Reg, - const TargetRegisterClass *RC) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - return RC->contains(Reg); - } else { - return MRI->getRegClass(Reg) == RC; - } -} - -R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { - MachineInstr *MI = SU->getInstr(); - - if (TII->isTransOnly(MI)) - return AluTrans; - - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return AluT_XYZW; - case AMDGPU::COPY: - if (MI->getOperand(1).isUndef()) { - // MI will become a KILL, don't considers it in scheduling - return AluDiscarded; - } - default: - break; - } - - // Does the instruction take a whole IG ? 
- // XXX: Is it possible to add a helper function in R600InstrInfo that can - // be used here and in R600PacketizerList::isSoloInstruction() ? - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { - return AluT_XYZW; - } - - if (TII->isLDSInstr(MI->getOpcode())) { - return AluT_X; - } - - // Is the result already assigned to a channel ? - unsigned DestSubReg = MI->getOperand(0).getSubReg(); - switch (DestSubReg) { - case AMDGPU::sub0: - return AluT_X; - case AMDGPU::sub1: - return AluT_Y; - case AMDGPU::sub2: - return AluT_Z; - case AMDGPU::sub3: - return AluT_W; - default: - break; - } - - // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) - return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) - return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) - return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) - return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) - return AluT_XYZW; - - // LDS src registers cannot be used in the Trans slot. - if (TII->readsLDSSrcReg(MI)) - return AluT_XYZW; - - return AluAny; - -} - -int R600SchedStrategy::getInstKind(SUnit* SU) { - int Opcode = SU->getInstr()->getOpcode(); - - if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) - return IDFetch; - - if (TII->isALUInstr(Opcode)) { - return IDAlu; - } - - switch (Opcode) { - case AMDGPU::PRED_X: - case AMDGPU::COPY: - case AMDGPU::CONST_COPY: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return IDAlu; - default: - return IDOther; - } -} - -SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { - if (Q.empty()) - return nullptr; - for (std::vector::reverse_iterator It = Q.rbegin(), E = Q.rend(); - It != E; ++It) { - SUnit *SU = *It; - InstructionsGroupCandidate.push_back(SU->getInstr()); - if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) - && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) - ) { - InstructionsGroupCandidate.pop_back(); - Q.erase((It + 1).base()); - return SU; - } else { - InstructionsGroupCandidate.pop_back(); - } - } - return nullptr; -} - -void R600SchedStrategy::LoadAlu() { - std::vector &QSrc = Pending[IDAlu]; - for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { - AluKind AK = getAluKind(QSrc[i]); - AvailableAlus[AK].push_back(QSrc[i]); - } - QSrc.clear(); -} - -void R600SchedStrategy::PrepareNextSlot() { - DEBUG(dbgs() << "New Slot\n"); - assert (OccupedSlotsMask && "Slot wasn't filled"); - OccupedSlotsMask = 0; -// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) -// OccupedSlotsMask |= 16; - InstructionsGroupCandidate.clear(); - LoadAlu(); -} - -void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { - int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - if (DstIndex == -1) { - return; - } - unsigned DestReg = MI->getOperand(DstIndex).getReg(); - // PressureRegister crashes if an operand is def and used in the same inst - // and we try to constraint its regclass - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && !MO.isDef() && - MO.getReg() == DestReg) - return; - } - // 
Constrains the regclass of DestReg to assign it to Slot - switch (Slot) { - case 0: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); - break; - case 1: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); - break; - case 2: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); - break; - case 3: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); - break; - } -} - -SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { - static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; - SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu); - if (SlotedSU) - return SlotedSU; - SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu); - if (UnslotedSU) - AssignSlot(UnslotedSU->getInstr(), Slot); - return UnslotedSU; -} - -unsigned R600SchedStrategy::AvailablesAluCount() const { - return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + - AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + - AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + - AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + - AvailableAlus[AluPredX].size(); -} - -SUnit* R600SchedStrategy::pickAlu() { - while (AvailablesAluCount() || !Pending[IDAlu].empty()) { - if (!OccupedSlotsMask) { - // Bottom up scheduling : predX must comes first - if (!AvailableAlus[AluPredX].empty()) { - OccupedSlotsMask |= 31; - return PopInst(AvailableAlus[AluPredX], false); - } - // Flush physical reg copies (RA will discard them) - if (!AvailableAlus[AluDiscarded].empty()) { - OccupedSlotsMask |= 31; - return PopInst(AvailableAlus[AluDiscarded], false); - } - // If there is a T_XYZW alu available, use it - if (!AvailableAlus[AluT_XYZW].empty()) { - OccupedSlotsMask |= 15; - return PopInst(AvailableAlus[AluT_XYZW], false); - } - } - bool TransSlotOccuped = OccupedSlotsMask & 16; - if (!TransSlotOccuped && VLIW5) { - if (!AvailableAlus[AluTrans].empty()) { - OccupedSlotsMask |= 16; - return PopInst(AvailableAlus[AluTrans], false); - } - SUnit *SU = AttemptFillSlot(3, true); - if (SU) { - OccupedSlotsMask |= 16; - return SU; - } - } - for (int Chan = 3; Chan > -1; --Chan) { - bool isOccupied = OccupedSlotsMask & (1 << Chan); - if (!isOccupied) { - SUnit *SU = AttemptFillSlot(Chan, false); - if (SU) { - OccupedSlotsMask |= (1 << Chan); - InstructionsGroupCandidate.push_back(SU->getInstr()); - return SU; - } - } - } - PrepareNextSlot(); - } - return nullptr; -} - -SUnit* R600SchedStrategy::pickOther(int QID) { - SUnit *SU = nullptr; - std::vector &AQ = Available[QID]; - - if (AQ.empty()) { - MoveUnits(Pending[QID], AQ); - } - if (!AQ.empty()) { - SU = AQ.back(); - AQ.resize(AQ.size() - 1); - } - return SU; -} diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h deleted file mode 100644 index fc5b95c28e7..00000000000 --- a/lib/Target/R600/R600MachineScheduler.h +++ /dev/null @@ -1,103 +0,0 @@ -//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Machine Scheduler interface -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H -#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H - -#include "R600InstrInfo.h" -#include "llvm/ADT/PriorityQueue.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -namespace llvm { - -class R600SchedStrategy : public MachineSchedStrategy { - - const ScheduleDAGMILive *DAG; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - MachineRegisterInfo *MRI; - - enum InstKind { - IDAlu, - IDFetch, - IDOther, - IDLast - }; - - enum AluKind { - AluAny, - AluT_X, - AluT_Y, - AluT_Z, - AluT_W, - AluT_XYZW, - AluPredX, - AluTrans, - AluDiscarded, // LLVM Instructions that are going to be eliminated - AluLast - }; - - std::vector Available[IDLast], Pending[IDLast]; - std::vector AvailableAlus[AluLast]; - std::vector PhysicalRegCopy; - - InstKind CurInstKind; - int CurEmitted; - InstKind NextInstKind; - - unsigned AluInstCount; - unsigned FetchInstCount; - - int InstKindLimit[IDLast]; - - int OccupedSlotsMask; - -public: - R600SchedStrategy() : - DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { - } - - virtual ~R600SchedStrategy() {} - - void initialize(ScheduleDAGMI *dag) override; - SUnit *pickNode(bool &IsTopNode) override; - void schedNode(SUnit *SU, bool IsTopNode) override; - void releaseTopNode(SUnit *SU) override; - void releaseBottomNode(SUnit *SU) override; - -private: - std::vector InstructionsGroupCandidate; - bool VLIW5; - - int getInstKind(SUnit *SU); - bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; - AluKind getAluKind(SUnit *SU) const; - void LoadAlu(); - unsigned AvailablesAluCount() const; - SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); - void PrepareNextSlot(); - SUnit *PopInst(std::vector &Q, bool AnyALU); - - void AssignSlot(MachineInstr *MI, unsigned Slot); - SUnit* pickAlu(); - SUnit* pickOther(int QID); - void MoveUnits(std::vector &QSrc, std::vector &QDst); -}; - -} // namespace llvm - -#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp deleted file mode 100644 index 0c06ccc736d..00000000000 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ /dev/null @@ -1,382 +0,0 @@ -//===--------------------- R600MergeVectorRegisters.cpp -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass merges inputs of swizzeable instructions into vector sharing -/// common data and/or have enough undef subreg using swizzle abilities. -/// -/// For instance let's consider the following pseudo code : -/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 -/// ... -/// vreg7 = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 -/// -/// is turned into : -/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 -/// ... 
-/// vreg7 = INSERT_SUBREG vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 -/// -/// This allow regalloc to reduce register pressure for vector registers and -/// to reduce MOV count. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "vec-merger" - -namespace { - -static bool -isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { - for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), - E = MRI.def_instr_end(); It != E; ++It) { - return (*It).isImplicitDef(); - } - if (MRI.isReserved(Reg)) { - return false; - } - llvm_unreachable("Reg without a def"); - return false; -} - -class RegSeqInfo { -public: - MachineInstr *Instr; - DenseMap RegToChan; - std::vector UndefReg; - RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { - assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); - for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { - MachineOperand &MO = Instr->getOperand(i); - unsigned Chan = Instr->getOperand(i + 1).getImm(); - if (isImplicitlyDef(MRI, MO.getReg())) - UndefReg.push_back(Chan); - else - RegToChan[MO.getReg()] = Chan; - } - } - RegSeqInfo() {} - - bool operator==(const RegSeqInfo &RSI) const { - return RSI.Instr == Instr; - } -}; - -class R600VectorRegMerger : public MachineFunctionPass { -private: - MachineRegisterInfo *MRI; - const R600InstrInfo *TII; - bool canSwizzle(const MachineInstr &) const; - bool areAllUsesSwizzeable(unsigned Reg) const; - void SwizzleInput(MachineInstr &, - const std::vector > &) const; - bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, - std::vector > &Remap) const; - bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan); - bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan); - MachineInstr *RebuildVector(RegSeqInfo *MI, - const RegSeqInfo *BaseVec, - const std::vector > &RemapChan) const; - void RemoveMI(MachineInstr *); - void trackRSI(const RegSeqInfo &RSI); - - typedef DenseMap > InstructionSetMap; - DenseMap PreviousRegSeq; - InstructionSetMap PreviousRegSeqByReg; - InstructionSetMap PreviousRegSeqByUndefCount; -public: - static char ID; - R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), - TII(nullptr) { } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - const char *getPassName() const override { - return "R600 Vector Registers Merge Pass"; - } - - bool runOnMachineFunction(MachineFunction &Fn) override; -}; - -char R600VectorRegMerger::ID = 0; - -bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) - const { - if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) - return true; - switch (MI.getOpcode()) { - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportSwz: - return true; - default: - return false; - } -} - -bool 
R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, - RegSeqInfo *ToMerge, std::vector< std::pair > &Remap) - const { - unsigned CurrentUndexIdx = 0; - for (DenseMap::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { - DenseMap::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); - if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair - ((*It).second, (*PosInUntouched).second)); - continue; - } - if (CurrentUndexIdx >= Untouched->UndefReg.size()) - return false; - Remap.push_back(std::pair - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); - } - - return true; -} - -static -unsigned getReassignedChan( - const std::vector > &RemapChan, - unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return RemapChan[j].second; - } - llvm_unreachable("Chan wasn't reassigned"); -} - -MachineInstr *R600VectorRegMerger::RebuildVector( - RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, - const std::vector > &RemapChan) const { - unsigned Reg = RSI->Instr->getOperand(0).getReg(); - MachineBasicBlock::iterator Pos = RSI->Instr; - MachineBasicBlock &MBB = *Pos->getParent(); - DebugLoc DL = Pos->getDebugLoc(); - - unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); - DenseMap UpdatedRegToChan = BaseRSI->RegToChan; - std::vector UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; - unsigned Chan = getReassignedChan(RemapChan, Swizzle); - - MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), - DstReg) - .addReg(SrcVec) - .addReg(SubReg) - .addImm(Chan); - UpdatedRegToChan[SubReg] = Chan; - std::vector::iterator ChanPos = - std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); - if (ChanPos != UpdatedUndef.end()) - UpdatedUndef.erase(ChanPos); - assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == - UpdatedUndef.end() && - "UpdatedUndef shouldn't contain Chan more than once!"); - DEBUG(dbgs() << " ->"; Tmp->dump();); - (void)Tmp; - SrcVec = DstReg; - } - Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) - .addReg(SrcVec); - DEBUG(dbgs() << " ->"; Pos->dump();); - - DEBUG(dbgs() << " Updating Swizzle:\n"); - for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), - E = MRI->use_instr_end(); It != E; ++It) { - DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); - SwizzleInput(*It, RemapChan); - DEBUG((*It).dump()); - } - RSI->Instr->eraseFromParent(); - - // Update RSI - RSI->Instr = Pos; - RSI->RegToChan = UpdatedRegToChan; - RSI->UndefReg = UpdatedUndef; - - return Pos; -} - -void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); - } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); - } -} - -void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, - const std::vector > &RemapChan) const { - unsigned Offset; - if (TII->get(MI.getOpcode()).TSFlags & 
R600_InstFlag::TEX_INST) - Offset = 2; - else - Offset = 3; - for (unsigned i = 0; i < 4; i++) { - unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); - break; - } - } - } -} - -bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { - for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), - E = MRI->use_instr_end(); It != E; ++It) { - if (!canSwizzle(*It)) - return false; - } - return true; -} - -bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, - RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan) { - for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), - MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { - if (!MOp->isReg()) - continue; - if (PreviousRegSeqByReg[MOp->getReg()].empty()) - continue; - for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { - CompatibleRSI = PreviousRegSeq[MI]; - if (RSI == CompatibleRSI) - continue; - if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) - return true; - } - } - return false; -} - -bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, - RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan) { - unsigned NeededUndefs = 4 - RSI.UndefReg.size(); - if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) - return false; - std::vector &MIs = - PreviousRegSeqByUndefCount[NeededUndefs]; - CompatibleRSI = PreviousRegSeq[MIs.back()]; - tryMergeVector(&CompatibleRSI, &RSI, RemapChan); - return true; -} - -void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { - for (DenseMap::const_iterator - It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { - PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); - } - PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); - PreviousRegSeq[RSI.Instr] = RSI; -} - -bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast(Fn.getSubtarget().getInstrInfo()); - MRI = &(Fn.getRegInfo()); - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; - PreviousRegSeq.clear(); - PreviousRegSeqByReg.clear(); - PreviousRegSeqByUndefCount.clear(); - - for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); - MII != MIIE; ++MII) { - MachineInstr *MI = MII; - if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { - if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI->getOperand(1).getReg(); - for (MachineRegisterInfo::def_instr_iterator - It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); - It != E; ++It) { - RemoveMI(&(*It)); - } - } - continue; - } - - - RegSeqInfo RSI(*MRI, MI); - - // All uses of MI are swizzeable ? 
- unsigned Reg = MI->getOperand(0).getReg(); - if (!areAllUsesSwizzeable(Reg)) - continue; - - DEBUG (dbgs() << "Trying to optimize "; - MI->dump(); - ); - - RegSeqInfo CandidateRSI; - std::vector > RemapChan; - DEBUG(dbgs() << "Using common slots...\n";); - if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { - // Remove CandidateRSI mapping - RemoveMI(CandidateRSI.Instr); - MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); - trackRSI(RSI); - continue; - } - DEBUG(dbgs() << "Using free slots...\n";); - RemapChan.clear(); - if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { - RemoveMI(CandidateRSI.Instr); - MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); - trackRSI(RSI); - continue; - } - //Failed to merge - trackRSI(RSI); - } - } - return false; -} - -} - -llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { - return new R600VectorRegMerger(tm); -} diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp deleted file mode 100644 index deee5bc3997..00000000000 --- a/lib/Target/R600/R600Packetizer.cpp +++ /dev/null @@ -1,408 +0,0 @@ -//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass implements instructions packetization for R600. It unsets isLast -/// bit of instructions inside a bundle and substitutes src register with -/// PreviousVector when applicable. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Debug.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "packets" - -namespace { - -class R600Packetizer : public MachineFunctionPass { - -public: - static char ID; - R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - const char *getPassName() const override { - return "R600 Packetizer"; - } - - bool runOnMachineFunction(MachineFunction &Fn) override; -}; -char R600Packetizer::ID = 0; - -class R600PacketizerList : public VLIWPacketizerList { - -private: - const R600InstrInfo *TII; - const R600RegisterInfo &TRI; - bool VLIW5; - bool ConsideredInstUsesAlreadyWrittenVectorElement; - - unsigned getSlot(const MachineInstr *MI) const { - return TRI.getHWRegChan(MI->getOperand(0).getReg()); - } - - /// \returns register to PV chan mapping for bundle/single instructions that - /// immediately precedes I. 
- DenseMap getPreviousVector(MachineBasicBlock::iterator I) - const { - DenseMap Result; - I--; - if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) - return Result; - MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); - if (I->isBundle()) - BI++; - int LastDstChan = -1; - do { - bool isTrans = false; - int BISlot = getSlot(BI); - if (LastDstChan >= BISlot) - isTrans = true; - LastDstChan = BISlot; - if (TII->isPredicated(BI)) - continue; - int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); - if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) - continue; - int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); - if (DstIdx == -1) { - continue; - } - unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { - Result[Dst] = AMDGPU::PS; - continue; - } - if (BI->getOpcode() == AMDGPU::DOT4_r600 || - BI->getOpcode() == AMDGPU::DOT4_eg) { - Result[Dst] = AMDGPU::PV_X; - continue; - } - if (Dst == AMDGPU::OQAP) { - continue; - } - unsigned PVReg = 0; - switch (TRI.getHWRegChan(Dst)) { - case 0: - PVReg = AMDGPU::PV_X; - break; - case 1: - PVReg = AMDGPU::PV_Y; - break; - case 2: - PVReg = AMDGPU::PV_Z; - break; - case 3: - PVReg = AMDGPU::PV_W; - break; - default: - llvm_unreachable("Invalid Chan"); - } - Result[Dst] = PVReg; - } while ((++BI)->isBundledWithPred()); - return Result; - } - - void substitutePV(MachineInstr *MI, const DenseMap &PVs) - const { - unsigned Ops[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); - if (OperandIdx < 0) - continue; - unsigned Src = MI->getOperand(OperandIdx).getReg(); - const DenseMap::const_iterator It = PVs.find(Src); - if (It != PVs.end()) - MI->getOperand(OperandIdx).setReg(It->second); - } - } -public: - // Ctor. - R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), - TII(static_cast( - MF.getSubtarget().getInstrInfo())), - TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getSubtarget().hasCaymanISA(); - } - - // initPacketizerState - initialize some internal flags. - void initPacketizerState() override { - ConsideredInstUsesAlreadyWrittenVectorElement = false; - } - - // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { - return false; - } - - // isSoloInstruction - return true if instruction MI can not be packetized - // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { - if (TII->isVector(*MI)) - return true; - if (!TII->isALUInstr(MI->getOpcode())) - return true; - if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) - return true; - // XXX: This can be removed once the packetizer properly handles all the - // LDS instruction group restrictions. - if (TII->isLDSInstr(MI->getOpcode())) - return true; - return false; - } - - // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ - // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { - MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); - if (getSlot(MII) == getSlot(MIJ)) - ConsideredInstUsesAlreadyWrittenVectorElement = true; - // Does MII and MIJ share the same pred_sel ? 
- int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), - OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); - unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, - PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; - if (PredI != PredJ) - return false; - if (SUJ->isSucc(SUI)) { - for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { - const SDep &Dep = SUJ->Succs[i]; - if (Dep.getSUnit() != SUI) - continue; - if (Dep.getKind() == SDep::Anti) - continue; - if (Dep.getKind() == SDep::Output) - if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) - continue; - return false; - } - } - - bool ARDef = TII->definesAddressRegister(MII) || - TII->definesAddressRegister(MIJ); - bool ARUse = TII->usesAddressRegister(MII) || - TII->usesAddressRegister(MIJ); - if (ARDef && ARUse) - return false; - - return true; - } - - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI - // and SUJ. - bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { - return false; - } - - void setIsLastBit(MachineInstr *MI, unsigned Bit) const { - unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); - MI->getOperand(LastOp).setImm(Bit); - } - - bool isBundlableWithCurrentPMI(MachineInstr *MI, - const DenseMap &PV, - std::vector &BS, - bool &isTransSlot) { - isTransSlot = TII->isTransOnly(MI); - assert (!isTransSlot || VLIW5); - - // Is the dst reg sequence legal ? - if (!isTransSlot && !CurrentPacketMIs.empty()) { - if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { - if (ConsideredInstUsesAlreadyWrittenVectorElement && - !TII->isVectorOnly(MI) && VLIW5) { - isTransSlot = true; - DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); - } - else - return false; - } - } - - // Are the Constants limitations met ? - CurrentPacketMIs.push_back(MI); - if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG( - dbgs() << "Couldn't pack :\n"; - MI->dump(); - dbgs() << "with the following packets :\n"; - for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { - CurrentPacketMIs[i]->dump(); - dbgs() << "\n"; - } - dbgs() << "because of Consts read limitations\n"; - ); - CurrentPacketMIs.pop_back(); - return false; - } - - // Is there a BankSwizzle set that meet Read Port limitations ? - if (!TII->fitsReadPortLimitations(CurrentPacketMIs, - PV, BS, isTransSlot)) { - DEBUG( - dbgs() << "Couldn't pack :\n"; - MI->dump(); - dbgs() << "with the following packets :\n"; - for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { - CurrentPacketMIs[i]->dump(); - dbgs() << "\n"; - } - dbgs() << "because of Read port limitations\n"; - ); - CurrentPacketMIs.pop_back(); - return false; - } - - // We cannot read LDS source registrs from the Trans slot. - if (isTransSlot && TII->readsLDSSrcReg(MI)) - return false; - - CurrentPacketMIs.pop_back(); - return true; - } - - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { - MachineBasicBlock::iterator FirstInBundle = - CurrentPacketMIs.empty() ? 
MI : CurrentPacketMIs.front(); - const DenseMap &PV = - getPreviousVector(FirstInBundle); - std::vector BS; - bool isTransSlot; - - if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { - for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { - MachineInstr *MI = CurrentPacketMIs[i]; - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS[i]); - } - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS.back()); - if (!CurrentPacketMIs.empty()) - setIsLastBit(CurrentPacketMIs.back(), 0); - substitutePV(MI, PV); - MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); - if (isTransSlot) { - endPacket(std::next(It)->getParent(), std::next(It)); - } - return It; - } - endPacket(MI->getParent(), MI); - if (TII->isTransOnly(MI)) - return MI; - return VLIWPacketizerList::addToPacket(MI); - } -}; - -bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - MachineLoopInfo &MLI = getAnalysis(); - - // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI); - - // DFA state table should not be empty. - assert(Packetizer.getResourceTracker() && "Empty DFA table!"); - - // - // Loop over all basic blocks and remove KILL pseudo-instructions - // These instructions confuse the dependence analysis. Consider: - // D0 = ... (Insn 0) - // R0 = KILL R0, D0 (Insn 1) - // R0 = ... (Insn 2) - // Here, Insn 1 will result in the dependence graph not emitting an output - // dependence between Insn 0 and Insn 2. This can lead to incorrect - // packetization - // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); - while (MI != End) { - if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || - (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; - } - ++MI; - } - } - - // Loop over all of the basic blocks. - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - // Find scheduling regions and schedule / packetize each region. - unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. - MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) - break; - } - I = MBB->begin(); - - // Skip empty scheduling regions. - if (I == RegionEnd) { - RegionEnd = std::prev(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. 
- if (I == std::prev(RegionEnd)) { - RegionEnd = std::prev(RegionEnd); - continue; - } - - Packetizer.PacketizeMIs(MBB, I, RegionEnd); - RegionEnd = I; - } - } - - return true; - -} - -} // end anonymous namespace - -llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { - return new R600Packetizer(tm); -} diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp deleted file mode 100644 index fb0359cfc65..00000000000 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ /dev/null @@ -1,91 +0,0 @@ -//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 implementation of the TargetRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#include "R600RegisterInfo.h" -#include "AMDGPUTargetMachine.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" - -using namespace llvm; - -R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { - RCW.RegWeight = 0; - RCW.WeightLimit = 0; -} - -BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - - const R600InstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - - Reserved.set(AMDGPU::ZERO); - Reserved.set(AMDGPU::HALF); - Reserved.set(AMDGPU::ONE); - Reserved.set(AMDGPU::ONE_INT); - Reserved.set(AMDGPU::NEG_HALF); - Reserved.set(AMDGPU::NEG_ONE); - Reserved.set(AMDGPU::PV_X); - Reserved.set(AMDGPU::ALU_LITERAL_X); - Reserved.set(AMDGPU::ALU_CONST); - Reserved.set(AMDGPU::PREDICATE_BIT); - Reserved.set(AMDGPU::PRED_SEL_OFF); - Reserved.set(AMDGPU::PRED_SEL_ZERO); - Reserved.set(AMDGPU::PRED_SEL_ONE); - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - - for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), - E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { - Reserved.set(*I); - } - - TII->reserveIndirectRegisters(Reserved, MF); - - return Reserved; -} - -unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { - return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; -} - -unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { - return GET_REG_INDEX(getEncodingValue(Reg)); -} - -const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::R600_TReg32RegClass; - } -} - -const RegClassWeight &R600RegisterInfo::getRegClassWeight( - const TargetRegisterClass *RC) const { - return RCW; -} - -bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - - switch (Reg) { - case AMDGPU::OQAP: - case AMDGPU::OQBP: - case AMDGPU::AR_X: - return false; - default: - return true; - } -} diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h deleted file mode 100644 index 9713e600a72..00000000000 --- a/lib/Target/R600/R600RegisterInfo.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for R600RegisterInfo -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H -#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H - -#include "AMDGPURegisterInfo.h" - -namespace llvm { - -class AMDGPUSubtarget; - -struct R600RegisterInfo : public AMDGPURegisterInfo { - RegClassWeight RCW; - - R600RegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override; - - /// \brief get the HW encoding for a register's channel. - unsigned getHWRegChan(unsigned reg) const; - - unsigned getHWRegIndex(unsigned Reg) const override; - - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - - const RegClassWeight & - getRegClassWeight(const TargetRegisterClass *RC) const override; - - // \returns true if \p Reg can be defined in one ALU caluse and used in another. - bool isPhysRegLiveAcrossClauses(unsigned Reg) const; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td deleted file mode 100644 index cc667d985a8..00000000000 --- a/lib/Target/R600/R600RegisterInfo.td +++ /dev/null @@ -1,252 +0,0 @@ - -class R600Reg encoding> : Register { - let Namespace = "AMDGPU"; - let HWEncoding = encoding; -} - -class R600RegWithChan sel, string chan> : - Register { - - field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, - !if(!eq(chan, "Y"), 1, - !if(!eq(chan, "Z"), 2, - !if(!eq(chan, "W"), 3, 0)))); - let HWEncoding{8-0} = sel; - let HWEncoding{10-9} = chan_encoding; - let Namespace = "AMDGPU"; -} - -class R600Reg_128 subregs, bits<16> encoding> : - RegisterWithSubRegs { - field bits<2> chan_encoding = 0; - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1, sub2, sub3]; - let HWEncoding{8-0} = encoding{8-0}; - let HWEncoding{10-9} = chan_encoding; -} - -class R600Reg_64 subregs, bits<16> encoding> : - RegisterWithSubRegs { - field bits<2> chan_encoding = 0; - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = encoding; - let HWEncoding{8-0} = encoding{8-0}; - let HWEncoding{10-9} = chan_encoding; -} - -class R600Reg_64Vertical : R600Reg_64 < - "V"#lo#hi#"_"#chan, - [!cast("T"#lo#"_"#chan), !cast("T"#hi#"_"#chan)], - lo ->; - -foreach Index = 0-127 in { - foreach Chan = [ "X", "Y", "Z", "W" ] in { - // 32-bit Temporary Registers - def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; - - // Indirect addressing offset registers - def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, - Index, Chan>; - } - // 128-bit Temporary Registers - def T#Index#_XYZW : R600Reg_128 <"T"#Index#"", - [!cast("T"#Index#"_X"), - !cast("T"#Index#"_Y"), - !cast("T"#Index#"_Z"), - !cast("T"#Index#"_W")], - Index>; - - def T#Index#_XY : R600Reg_64 <"T"#Index#"", - [!cast("T"#Index#"_X"), - !cast("T"#Index#"_Y")], - Index>; -} - -foreach Chan = [ "X", "Y", "Z", "W"] in { - - let chan_encoding = !if(!eq(Chan, "X"), 0, - !if(!eq(Chan, "Y"), 1, - !if(!eq(Chan, "Z"), 2, - !if(!eq(Chan, "W"), 3, 0)))) in { - def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, - [!cast("T0_"#Chan), - !cast("T1_"#Chan), - !cast("T2_"#Chan), - !cast("T3_"#Chan)], - 0>; - def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; - def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; - } -} 
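// Worked example (illustrative, based on the encoding fields above): for
// T5_Z, sel = 5 and the "Z" channel encodes to 2, so
//   HWEncoding = (2 << 9) | 5 = 0x405
// Assuming HW_CHAN_SHIFT is 9 (matching HWEncoding{10-9} above),
// R600RegisterInfo::getHWRegChan() recovers the channel as 0x405 >> 9 = 2,
// and GET_REG_INDEX() presumably masks the low nine bits to recover index 5.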
- - -// KCACHE_BANK0 -foreach Index = 159-128 in { - foreach Chan = [ "X", "Y", "Z", "W" ] in { - // 32-bit Temporary Registers - def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>; - } - // 128-bit Temporary Registers - def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW", - [!cast("KC0_"#Index#"_X"), - !cast("KC0_"#Index#"_Y"), - !cast("KC0_"#Index#"_Z"), - !cast("KC0_"#Index#"_W")], - Index>; -} - -// KCACHE_BANK1 -foreach Index = 191-160 in { - foreach Chan = [ "X", "Y", "Z", "W" ] in { - // 32-bit Temporary Registers - def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>; - } - // 128-bit Temporary Registers - def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW", - [!cast("KC1_"#Index#"_X"), - !cast("KC1_"#Index#"_Y"), - !cast("KC1_"#Index#"_Z"), - !cast("KC1_"#Index#"_W")], - Index>; -} - - -// Array Base Register holding input in FS -foreach Index = 448-480 in { - def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; -} - - -// Special Registers - -def OQA : R600Reg<"OQA", 219>; -def OQB : R600Reg<"OQB", 220>; -def OQAP : R600Reg<"OQAP", 221>; -def OQBP : R600Reg<"OQAP", 222>; -def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>; -def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>; -def ZERO : R600Reg<"0.0", 248>; -def ONE : R600Reg<"1.0", 249>; -def NEG_ONE : R600Reg<"-1.0", 249>; -def ONE_INT : R600Reg<"1", 250>; -def HALF : R600Reg<"0.5", 252>; -def NEG_HALF : R600Reg<"-0.5", 252>; -def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; -def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; -def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; -def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; -def PV_X : R600RegWithChan<"PV.X", 254, "X">; -def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">; -def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">; -def PV_W : R600RegWithChan<"PV.W", 254, "W">; -def PS: R600Reg<"PS", 255>; -def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; -def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; -def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; -def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; -def AR_X : R600Reg<"AR.x", 0>; - -def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "ArrayBase%u", 448, 480))>; -// special registers for ALU src operands -// const buffer reference, SRCx_SEL contains index -def ALU_CONST : R600Reg<"CBuf", 0>; -// interpolation param reference, SRCx_SEL contains index -def ALU_PARAM : R600Reg<"Param", 0>; - -let isAllocatable = 0 in { - -def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; - -// We only use Addr_[YZW] for vertical vectors. -// FIXME if we add more vertical vector registers we will need to ad more -// registers to these classes. 
-def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; -def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; -def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; - -def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, - (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; - -def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_X", 128, 159))>; - -def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_Y", 128, 159))>; - -def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_Z", 128, 159))>; - -def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_W", 128, 159))>; - -def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_KC0_X, R600_KC0_Y, - R600_KC0_Z, R600_KC0_W)>; - -def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_X", 160, 191))>; - -def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_Y", 160, 191))>; - -def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_Z", 160, 191))>; - -def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_W", 160, 191))>; - -def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_KC1_X, R600_KC1_Y, - R600_KC1_Z, R600_KC1_W)>; - -} // End isAllocatable = 0 - -def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_X", 0, 127), AR_X)>; - -def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_Y", 0, 127))>; - -def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_Z", 0, 127))>; - -def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_W", 0, 127))>; - -def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_TReg32_X, R600_TReg32_Y, - R600_TReg32_Z, R600_TReg32_W)>; - -def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add - R600_TReg32, - R600_ArrayBase, - R600_Addr, - R600_KC0, R600_KC1, - ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP - )>; - -def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add - PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; - -def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add - PREDICATE_BIT)>; - -def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, - (add (sequence "T%u_XYZW", 0, 127))> { - let CopyCost = -1; -} - -def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, - (add V0123_W, V0123_Z, V0123_Y, V0123_X) ->; - -def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, - (add (sequence "T%u_XY", 0, 63))>; - -def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, - (add V01_X, V01_Y, V01_Z, V01_W, - V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td deleted file mode 100644 index df62bf85c0a..00000000000 --- a/lib/Target/R600/R600Schedule.td +++ /dev/null @@ -1,49 +0,0 @@ -//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction -// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS -// slot has been removed. -// -//===----------------------------------------------------------------------===// - - -def ALU_X : FuncUnit; -def ALU_Y : FuncUnit; -def ALU_Z : FuncUnit; -def ALU_W : FuncUnit; -def TRANS : FuncUnit; - -def AnyALU : InstrItinClass; -def VecALU : InstrItinClass; -def TransALU : InstrItinClass; -def XALU : InstrItinClass; - -def R600_VLIW5_Itin : ProcessorItineraries < - [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], - [], - [ - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]> - ] ->; - -def R600_VLIW4_Itin : ProcessorItineraries < - [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], - [], - [ - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]> - ] ->; diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp deleted file mode 100644 index 2fc7b02f673..00000000000 --- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp +++ /dev/null @@ -1,303 +0,0 @@ -//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass translates tgsi-like texture intrinsics into R600 texture -/// closer to hardware intrinsics. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { -class R600TextureIntrinsicsReplacer : - public FunctionPass, public InstVisitor { - static char ID; - - Module *Mod; - Type *FloatType; - Type *Int32Type; - Type *V4f32Type; - Type *V4i32Type; - FunctionType *TexSign; - FunctionType *TexQSign; - - void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, - unsigned SrcSelect[4], unsigned CT[4], - bool &useShadowVariant) { - enum TextureTypes { - TEXTURE_1D = 1, - TEXTURE_2D, - TEXTURE_3D, - TEXTURE_CUBE, - TEXTURE_RECT, - TEXTURE_SHADOW1D, - TEXTURE_SHADOW2D, - TEXTURE_SHADOWRECT, - TEXTURE_1D_ARRAY, - TEXTURE_2D_ARRAY, - TEXTURE_SHADOW1D_ARRAY, - TEXTURE_SHADOW2D_ARRAY, - TEXTURE_SHADOWCUBE, - TEXTURE_2D_MSAA, - TEXTURE_2D_ARRAY_MSAA, - TEXTURE_CUBE_ARRAY, - TEXTURE_SHADOWCUBE_ARRAY - }; - - switch (TextureType) { - case 0: - useShadowVariant = false; - return; - case TEXTURE_RECT: - case TEXTURE_1D: - case TEXTURE_2D: - case TEXTURE_3D: - case TEXTURE_CUBE: - case TEXTURE_1D_ARRAY: - case TEXTURE_2D_ARRAY: - case TEXTURE_CUBE_ARRAY: - case TEXTURE_2D_MSAA: - case TEXTURE_2D_ARRAY_MSAA: - useShadowVariant = false; - break; - case TEXTURE_SHADOW1D: - case TEXTURE_SHADOW2D: - case TEXTURE_SHADOWRECT: - case TEXTURE_SHADOW1D_ARRAY: - case TEXTURE_SHADOW2D_ARRAY: - case TEXTURE_SHADOWCUBE: - case TEXTURE_SHADOWCUBE_ARRAY: - useShadowVariant = true; - break; - default: - llvm_unreachable("Unknow Texture Type"); - } - - if (TextureType == TEXTURE_RECT || - TextureType == TEXTURE_SHADOWRECT) { - CT[0] = 0; - CT[1] = 0; - } - - if (TextureType == TEXTURE_CUBE_ARRAY || - TextureType == TEXTURE_SHADOWCUBE_ARRAY) - CT[2] = 0; - - if (TextureType == 
TEXTURE_1D_ARRAY || - TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (hasLOD && useShadowVariant) { - CT[1] = 0; - } else { - CT[2] = 0; - SrcSelect[2] = 1; - } - } else if (TextureType == TEXTURE_2D_ARRAY || - TextureType == TEXTURE_SHADOW2D_ARRAY) { - CT[2] = 0; - } - - if ((TextureType == TEXTURE_SHADOW1D || - TextureType == TEXTURE_SHADOW2D || - TextureType == TEXTURE_SHADOWRECT || - TextureType == TEXTURE_SHADOW1D_ARRAY) && - !(hasLOD && useShadowVariant)) - SrcSelect[3] = 2; - } - - void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, - unsigned SrcSelect[4], Value *Offset[3], Value *Resource, - Value *Sampler, unsigned CT[4], Value *Coord) { - IRBuilder<> Builder(&I); - Constant *Mask[] = { - ConstantInt::get(Int32Type, SrcSelect[0]), - ConstantInt::get(Int32Type, SrcSelect[1]), - ConstantInt::get(Int32Type, SrcSelect[2]), - ConstantInt::get(Int32Type, SrcSelect[3]) - }; - Value *SwizzleMask = ConstantVector::get(Mask); - Value *SwizzledCoord = - Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); - - Value *Args[] = { - SwizzledCoord, - Offset[0], - Offset[1], - Offset[2], - Resource, - Sampler, - ConstantInt::get(Int32Type, CT[0]), - ConstantInt::get(Int32Type, CT[1]), - ConstantInt::get(Int32Type, CT[2]), - ConstantInt::get(Int32Type, CT[3]) - }; - - Function *F = Mod->getFunction(Name); - if (!F) { - F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); - F->addFnAttr(Attribute::ReadNone); - } - I.replaceAllUsesWith(Builder.CreateCall(F, Args)); - I.eraseFromParent(); - } - - void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, - const char *VanillaInt, - const char *ShadowInt) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(1); - Value *SamplerId = I.getArgOperand(2); - - unsigned TextureType = - cast(I.getArgOperand(3))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0) - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - - void ReplaceTXF(CallInst &I) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(4); - Value *SamplerId = I.getArgOperand(5); - - unsigned TextureType = - cast(I.getArgOperand(6))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - I.getArgOperand(1), - I.getArgOperand(2), - I.getArgOperand(3), - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - -public: - R600TextureIntrinsicsReplacer(): - FunctionPass(ID) { - } - - bool doInitialization(Module &M) override { - LLVMContext &Ctx = M.getContext(); - Mod = &M; - FloatType = Type::getFloatTy(Ctx); - Int32Type = Type::getInt32Ty(Ctx); - V4f32Type = VectorType::get(FloatType, 4); - V4i32Type = VectorType::get(Int32Type, 4); - Type *ArgsType[] = { - V4f32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); - Type *ArgsQType[] = { - V4i32Type, - Int32Type, - Int32Type, 
- Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); - return false; - } - - bool runOnFunction(Function &F) override { - visit(F); - return false; - } - - const char *getPassName() const override { - return "R600 Texture Intrinsics Replacer"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - } - - void visitCallInst(CallInst &I) { - if (!I.getCalledFunction()) - return; - - StringRef Name = I.getCalledFunction()->getName(); - if (Name == "llvm.AMDGPU.tex") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); - return; - } - if (Name == "llvm.AMDGPU.txl") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); - return; - } - if (Name == "llvm.AMDGPU.txb") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); - return; - } - if (Name == "llvm.AMDGPU.txf") { - ReplaceTXF(I); - return; - } - if (Name == "llvm.AMDGPU.txq") { - ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); - return; - } - if (Name == "llvm.AMDGPU.ddx") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); - return; - } - if (Name == "llvm.AMDGPU.ddy") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); - return; - } - } - -}; - -char R600TextureIntrinsicsReplacer::ID = 0; - -} - -FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { - return new R600TextureIntrinsicsReplacer(); -} diff --git a/lib/Target/R600/R700Instructions.td b/lib/Target/R600/R700Instructions.td deleted file mode 100644 index 613a0d729bb..00000000000 --- a/lib/Target/R600/R700Instructions.td +++ /dev/null @@ -1,21 +0,0 @@ -//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are: -// - Available to R700 and newer VLIW4/VLIW5 GPUs -// - Available only on R700 family GPUs. -// -//===----------------------------------------------------------------------===// - -def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; - -let Predicates = [isR700] in { - def SIN_r700 : SIN_Common<0x6E>; - def COS_r700 : COS_Common<0x6F>; -} diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp deleted file mode 100644 index ccfbf1bf19e..00000000000 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ /dev/null @@ -1,365 +0,0 @@ -//===-- SIAnnotateControlFlow.cpp - ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Annotates the control flow with hardware specific intrinsics. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-annotate-control-flow" - -namespace { - -// Complex types used in this pass -typedef std::pair StackEntry; -typedef SmallVector StackVector; - -// Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.SI.if"; -static const char *const ElseIntrinsic = "llvm.SI.else"; -static const char *const BreakIntrinsic = "llvm.SI.break"; -static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; -static const char *const LoopIntrinsic = "llvm.SI.loop"; -static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; - -class SIAnnotateControlFlow : public FunctionPass { - - static char ID; - - Type *Boolean; - Type *Void; - Type *Int64; - Type *ReturnStruct; - - ConstantInt *BoolTrue; - ConstantInt *BoolFalse; - UndefValue *BoolUndef; - Constant *Int64Zero; - - Constant *If; - Constant *Else; - Constant *Break; - Constant *IfBreak; - Constant *ElseBreak; - Constant *Loop; - Constant *EndCf; - - DominatorTree *DT; - StackVector Stack; - - LoopInfo *LI; - - bool isTopOfStack(BasicBlock *BB); - - Value *popSaved(); - - void push(BasicBlock *BB, Value *Saved); - - bool isElse(PHINode *Phi); - - void eraseIfUnused(PHINode *Phi); - - void openIf(BranchInst *Term); - - void insertElse(BranchInst *Term); - - Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); - - void handleLoop(BranchInst *Term); - - void closeControlFlow(BasicBlock *BB); - -public: - SIAnnotateControlFlow(): - FunctionPass(ID) { } - - bool doInitialization(Module &M) override; - - bool runOnFunction(Function &F) override; - - const char *getPassName() const override { - return "SI annotate control flow"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - FunctionPass::getAnalysisUsage(AU); - } - -}; - -} // end anonymous namespace - -char SIAnnotateControlFlow::ID = 0; - -/// \brief Initialize all the types and constants used in the pass -bool SIAnnotateControlFlow::doInitialization(Module &M) { - LLVMContext &Context = M.getContext(); - - Void = Type::getVoidTy(Context); - Boolean = Type::getInt1Ty(Context); - Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); - - BoolTrue = ConstantInt::getTrue(Context); - BoolFalse = ConstantInt::getFalse(Context); - BoolUndef = UndefValue::get(Boolean); - Int64Zero = ConstantInt::get(Int64, 0); - - If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); - - Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); - - Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)nullptr); - - IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); - - ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); - - Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, Int64, (Type *)nullptr); - - EndCf = M.getOrInsertFunction( 
- EndCfIntrinsic, Void, Int64, (Type *)nullptr); - - return false; -} - -/// \brief Is BB the last block saved on the stack ? -bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { - return !Stack.empty() && Stack.back().first == BB; -} - -/// \brief Pop the last saved value from the control flow stack -Value *SIAnnotateControlFlow::popSaved() { - return Stack.pop_back_val().second; -} - -/// \brief Push a BB and saved value to the control flow stack -void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { - Stack.push_back(std::make_pair(BB, Saved)); -} - -/// \brief Can the condition represented by this PHI node treated like -/// an "Else" block? -bool SIAnnotateControlFlow::isElse(PHINode *Phi) { - BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - if (Phi->getIncomingBlock(i) == IDom) { - - if (Phi->getIncomingValue(i) != BoolTrue) - return false; - - } else { - if (Phi->getIncomingValue(i) != BoolFalse) - return false; - - } - } - return true; -} - -// \brief Erase "Phi" if it is not used any more -void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (!Phi->hasNUsesOrMore(1)) - Phi->eraseFromParent(); -} - -/// \brief Open a new "If" block -void SIAnnotateControlFlow::openIf(BranchInst *Term) { - Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); -} - -/// \brief Close the last "If" block and open a new "Else" block -void SIAnnotateControlFlow::insertElse(BranchInst *Term) { - Value *Ret = CallInst::Create(Else, popSaved(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); -} - -/// \brief Recursively handle the condition leading to a loop -Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L) { - - // Only search through PHI nodes which are inside the loop. If we try this - // with PHI nodes that are outside of the loop, we end up inserting new PHI - // nodes outside of the loop which depend on values defined inside the loop. - // This will break the module with - // 'Instruction does not dominate all users!' errors. 
- PHINode *Phi = nullptr; - if ((Phi = dyn_cast(Cond)) && L->contains(Phi)) { - - BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); - Value *Ret = NewPhi; - - // Handle all non-constant incoming values first - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - BasicBlock *From = Phi->getIncomingBlock(i); - if (isa(Incoming)) { - NewPhi->addIncoming(Broken, From); - continue; - } - - Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L); - NewPhi->addIncoming(PhiArg, From); - } - - BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); - - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - - Value *Incoming = Phi->getIncomingValue(i); - if (Incoming != BoolTrue) - continue; - - BasicBlock *From = Phi->getIncomingBlock(i); - if (From == IDom) { - CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); - if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; - Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - continue; - } - } - TerminatorInst *Insert = From->getTerminator(); - Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); - NewPhi->setIncomingValue(i, PhiArg); - } - eraseIfUnused(Phi); - return Ret; - - } else if (Instruction *Inst = dyn_cast(Cond)) { - BasicBlock *Parent = Inst->getParent(); - Instruction *Insert; - if (L->contains(Inst)) { - Insert = Parent->getTerminator(); - } else { - Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); - } - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Insert); - - } else { - llvm_unreachable("Unhandled loop condition!"); - } - return 0; -} - -/// \brief Handle a back edge (loop) -void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { - BasicBlock *BB = Term->getParent(); - llvm::Loop *L = LI->getLoopFor(BB); - BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); - - Value *Cond = Term->getCondition(); - Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L); - - for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); - PI != PE; ++PI) { - - Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); - } - - Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - push(Term->getSuccessor(0), Arg); -}/// \brief Close the last opened control flow -void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { - llvm::Loop *L = LI->getLoopFor(BB); - - if (L && L->getHeader() == BB) { - // We can't insert an EndCF call into a loop header, because it will - // get executed on every iteration of the loop, when it should be - // executed only once before the loop. - SmallVector Latches; - L->getLoopLatches(Latches); - - std::vector Preds; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) - Preds.push_back(*PI); - } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); - } - - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); -} - -/// \brief Annotate the control flow with intrinsics so the backend can -/// recognize if/then/else and loops. 
-bool SIAnnotateControlFlow::runOnFunction(Function &F) { - DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); - - for (df_iterator I = df_begin(&F.getEntryBlock()), - E = df_end(&F.getEntryBlock()); I != E; ++I) { - - BranchInst *Term = dyn_cast((*I)->getTerminator()); - - if (!Term || Term->isUnconditional()) { - if (isTopOfStack(*I)) - closeControlFlow(*I); - continue; - } - - if (I.nodeVisited(Term->getSuccessor(1))) { - if (isTopOfStack(*I)) - closeControlFlow(*I); - handleLoop(Term); - continue; - } - - if (isTopOfStack(*I)) { - PHINode *Phi = dyn_cast(Term->getCondition()); - if (Phi && Phi->getParent() == *I && isElse(Phi)) { - insertElse(Term); - eraseIfUnused(Phi); - continue; - } - closeControlFlow(*I); - } - openIf(Term); - } - - assert(Stack.empty()); - return true; -} - -/// \brief Create the annotation pass -FunctionPass *llvm::createSIAnnotateControlFlowPass() { - return new SIAnnotateControlFlow(); -} diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h deleted file mode 100644 index 4727d971ab7..00000000000 --- a/lib/Target/R600/SIDefines.h +++ /dev/null @@ -1,172 +0,0 @@ -//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCInstrDesc.h" - -#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H -#define LLVM_LIB_TARGET_R600_SIDEFINES_H - -namespace SIInstrFlags { -// This needs to be kept in sync with the field bits in InstSI. -enum { - SALU = 1 << 3, - VALU = 1 << 4, - - SOP1 = 1 << 5, - SOP2 = 1 << 6, - SOPC = 1 << 7, - SOPK = 1 << 8, - SOPP = 1 << 9, - - VOP1 = 1 << 10, - VOP2 = 1 << 11, - VOP3 = 1 << 12, - VOPC = 1 << 13, - - MUBUF = 1 << 14, - MTBUF = 1 << 15, - SMRD = 1 << 16, - DS = 1 << 17, - MIMG = 1 << 18, - FLAT = 1 << 19, - WQM = 1 << 20, - VGPRSpill = 1 << 21 -}; -} - -namespace llvm { -namespace AMDGPU { - enum OperandType { - /// Operand with register or 32-bit immediate - OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, - /// Operand with register or inline constant - OPERAND_REG_INLINE_C - }; -} -} - -namespace SIInstrFlags { - enum Flags { - // First 4 bits are the instruction encoding - VM_CNT = 1 << 0, - EXP_CNT = 1 << 1, - LGKM_CNT = 1 << 2 - }; - - // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. - // The result is true if any of these tests are true. 
- enum ClassFlags { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, // Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity - }; -} - -namespace SISrcMods { - enum { - NEG = 1 << 0, - ABS = 1 << 1 - }; -} - -namespace SIOutMods { - enum { - NONE = 0, - MUL2 = 1, - MUL4 = 2, - DIV2 = 3 - }; -} - -#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 -#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C -#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) -#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 -#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 -#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 -#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) -#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) -#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C -#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) -#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) -#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) -#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) -#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) -#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) -#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) - -#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) -#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - - -#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 -#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) -#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) -#define C_00B848_VGPRS 0xFFFFFFC0 -#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) -#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) -#define C_00B848_SGPRS 0xFFFFFC3F -#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) -#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) -#define C_00B848_PRIORITY 0xFFFFF3FF -#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) -#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) -#define C_00B848_FLOAT_MODE 0xFFF00FFF -#define S_00B848_PRIV(x) (((x) & 0x1) << 20) -#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) -#define C_00B848_PRIV 0xFFEFFFFF -#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) -#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) -#define C_00B848_DX10_CLAMP 0xFFDFFFFF -#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) -#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) -#define C_00B848_DEBUG_MODE 0xFFBFFFFF -#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) -#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) -#define C_00B848_IEEE_MODE 0xFF7FFFFF - - -// Helpers for setting FLOAT_MODE -#define FP_ROUND_ROUND_TO_NEAREST 0 -#define FP_ROUND_ROUND_TO_INF 1 -#define FP_ROUND_ROUND_TO_NEGINF 2 -#define FP_ROUND_ROUND_TO_ZERO 3 - -// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double -// precision. -#define FP_ROUND_MODE_SP(x) ((x) & 0x3) -#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) - -#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 -#define FP_DENORM_FLUSH_OUT 1 -#define FP_DENORM_FLUSH_IN 2 -#define FP_DENORM_FLUSH_NONE 3 - - -// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double -// precision. 
-#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) -#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) - -#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 -#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) - -#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 -#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) - - -#endif diff --git a/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp b/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp deleted file mode 100644 index 5fe8d19426d..00000000000 --- a/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Spilling of EXEC masks used for control flow messes up control flow -/// lowering, so mark all live intervals associated with CF instructions as -/// non-spillable. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-cf-live-intervals" - -namespace { - -class SIFixControlFlowLiveIntervals : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix CF Live Intervals"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. 
- -INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, - "SI Fix CF Live Intervals", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, - "SI Fix CF Live Intervals", false, false) - -char SIFixControlFlowLiveIntervals::ID = 0; - -char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; - -FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { - return new SIFixControlFlowLiveIntervals(); -} - -bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { - LiveIntervals *LIS = &getAnalysis(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - switch (MI.getOpcode()) { - case AMDGPU::SI_IF: - case AMDGPU::SI_ELSE: - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - case AMDGPU::SI_END_CF: { - unsigned Reg = MI.getOperand(0).getReg(); - LIS->getInterval(Reg).markNotSpillable(); - break; - } - default: - break; - } - } - } - - return false; -} diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp deleted file mode 100644 index 23502b45905..00000000000 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ /dev/null @@ -1,338 +0,0 @@ -//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Copies from VGPR to SGPR registers are illegal and the register coalescer -/// will sometimes generate these illegal copies in situations like this: -/// -/// Register Class is the union of and -/// -/// BB0: -/// %vreg0 = SCALAR_INST -/// %vreg1 = COPY %vreg0 -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 = VECTOR_INST -/// %vreg3 = COPY %vreg2 -/// BB2: -/// %vreg4 = PHI %vreg1 , , %vreg3 , -/// %vreg5 = VECTOR_INST %vreg4 -/// -/// -/// The coalescer will begin at BB0 and eliminate its copy, then the resulting -/// code will look like this: -/// -/// BB0: -/// %vreg0 = SCALAR_INST -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 = VECTOR_INST -/// %vreg3 = COPY %vreg2 -/// BB2: -/// %vreg4 = PHI %vreg0 , , %vreg3 , -/// %vreg5 = VECTOR_INST %vreg4 -/// -/// Now that the result of the PHI instruction is an SGPR, the register -/// allocator is now forced to constrain the register class of %vreg3 to -/// so we end up with final code like this: -/// -/// BB0: -/// %vreg0 = SCALAR_INST -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 = VECTOR_INST -/// %vreg3 = COPY %vreg2 -/// BB2: -/// %vreg4 = PHI %vreg0 , , %vreg3 , -/// %vreg5 = VECTOR_INST %vreg4 -/// -/// Now this code contains an illegal copy from a VGPR to an SGPR. -/// -/// In order to avoid this problem, this pass searches for PHI instructions -/// which define a register and constrains its definition class to -/// if the user of the PHI's definition register is a vector instruction. -/// If the PHI's definition class is constrained to then the coalescer -/// will be unable to perform the COPY removal from the above example which -/// ultimately led to the creation of an illegal COPY. 
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "sgpr-copies" - -namespace { - -class SIFixSGPRCopies : public MachineFunctionPass { - -private: - static char ID; - const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; - -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR copies"; - } - -}; - -} // End anonymous namespace - -char SIFixSGPRCopies::ID = 0; - -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); -} - -static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) - continue; - - if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) - return true; - } - return false; -} - -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? 
- MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } - - return RC; -} - -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } - - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); -} - -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { - - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); - - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. - return false; - } - - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); - - const TargetRegisterClass *SrcRC; - - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) - return false; - - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); -} - -bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } - - switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); - - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); - } - - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. 
- // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // FIXME: This is OK if the branching decision is made based on an - // SGPR value. - bool SGPRBranch = false; - - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. - - bool HasBreakDef = false; - for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { - unsigned Reg = MI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { - TII->moveToVALU(MI); - break; - } - MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); - assert(DefInstr); - switch(DefInstr->getOpcode()) { - - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - // If we see a PHI instruction that defines an SGPR, then that PHI - // instruction has already been considered and should have - // a *_BREAK as an operand. - case AMDGPU::PHI: - HasBreakDef = true; - break; - } - } - - if (!SGPRBranch && !HasBreakDef) - TII->moveToVALU(MI); - break; - } - case AMDGPU::REG_SEQUENCE: { - if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) - continue; - - DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); - - TII->moveToVALU(MI); - break; - } - case AMDGPU::INSERT_SUBREG: { - const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); - if (TRI->isSGPRClass(DstRC) && - (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); - TII->moveToVALU(MI); - } - break; - } - } - } - } - - return true; -} diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp deleted file mode 100644 index 0c54446b0fb..00000000000 --- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp +++ /dev/null @@ -1,192 +0,0 @@ -//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. -/// -/// The main case we need to handle is when a def is used in one side of a -/// branch and not another. For example: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// -/// Here we need the register allocator to avoid assigning any of the defs -/// inside of the IF to the same register as %def. 
In traditional live -/// interval analysis %def is not live inside the IF branch, however, since -/// SALU instructions inside of IF will be executed even if the branch is not -/// taken, there is the chance that one of the instructions will overwrite the -/// value of %def, so the use in ELSE will see the wrong value. -/// -/// The strategy we use for solving this is to add an extra use after the ENDIF: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// %use -/// -/// Adding this use will make the def live thoughout the IF branch, which is -/// what we want. - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-sgpr-live-ranges" - -namespace { - -class SIFixSGPRLiveRanges : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { - initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR live ranges"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) - -char SIFixSGPRLiveRanges::ID = 0; - -char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; - -FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { - return new SIFixSGPRLiveRanges(); -} - -bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const SIRegisterInfo *TRI = static_cast( - MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis(); - MachinePostDominatorTree *PDT = &getAnalysis(); - std::vector> SGPRLiveRanges; - - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; - unsigned Def = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); - } - } - } - } - - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) - continue; - - // We have structured control flow, so number of succesors should be two. 
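(A rough illustration of the strategy described in the file comment above; hypothetical MIR, register and block names invented for the sketch, not part of the original source. The artificial use inserted further down lands at the join block, keeping the SGPR live through both arms:)

    %def = S_MOV_B32 ...            ; defined before the branch
    bb.if:    <other SALU defs>     ; SALU still executes here even when not "taken"
    bb.else:  ... = %def            ; the real use
    bb.endif:
      SGPR_USE implicit %def        ; artificial use at the nearest common post-dominator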
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); - MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); - - if (!NCD) - continue; - - MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); - - if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { - assert(NCD->succ_size() == 2); - // We want to make sure we insert the Use after the ENDIF, not after - // the ELSE. - NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), - *(++NCD->succ_begin())); - } - assert(SuccA && SuccB); - for (std::pair RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) - continue; - - // This interval is live in to one successor, but not the other, so - // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); - - // FIXME: Need to figure out how to update LiveRange here so this pass - // will be able to preserve LiveInterval analysis. - BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); - } - } - - return false; -} diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp deleted file mode 100644 index d14e37a6461..00000000000 --- a/lib/Target/R600/SIFoldOperands.cpp +++ /dev/null @@ -1,288 +0,0 @@ -//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#define DEBUG_TYPE "si-fold-operands" -using namespace llvm; - -namespace { - -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fold Operands"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -struct FoldCandidate { - MachineInstr *UseMI; - unsigned UseOpNo; - MachineOperand *OpToFold; - uint64_t ImmToFold; - - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), UseOpNo(OpNo) { - - if (FoldOp->isImm()) { - OpToFold = nullptr; - ImmToFold = FoldOp->getImm(); - } else { - assert(FoldOp->isReg()); - OpToFold = FoldOp; - } - } - - bool isImm() const { - return !OpToFold; - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) - -char SIFoldOperands::ID = 0; - -char &llvm::SIFoldOperandsID = SIFoldOperands::ID; - -FunctionPass *llvm::createSIFoldOperandsPass() { - return new SIFoldOperands(); -} - -static bool isSafeToFold(unsigned Opcode) { - switch(Opcode) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::COPY: - return true; - default: - return false; - } -} - -static bool updateOperand(FoldCandidate &Fold, - const TargetRegisterInfo &TRI) { - MachineInstr *MI = Fold.UseMI; - MachineOperand &Old = MI->getOperand(Fold.UseOpNo); - assert(Old.isReg()); - - if (Fold.isImm()) { - Old.ChangeToImmediate(Fold.ImmToFold); - return true; - } - - MachineOperand *New = Fold.OpToFold; - if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && - TargetRegisterInfo::isVirtualRegister(New->getReg())) { - Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); - return true; - } - - // FIXME: Handle physical registers. - - return false; -} - -static bool tryAddToFoldList(std::vector &FoldList, - MachineInstr *MI, unsigned OpNo, - MachineOperand *OpToFold, - const SIInstrInfo *TII) { - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { - // Operand is not legal, so try to commute the instruction to - // see if this makes it possible to fold. 
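(A hedged illustration of why commuting helps, with approximate assembly and invented operand names, not taken from the original file: in the 32-bit VOP2 encodings only src0 can be a constant, so an immediate sitting in the src1 slot becomes foldable once the operands are swapped:)

    v_add_i32_e32 %dst, vcc, %v0, 64   ; immediate in src1 -> fold not legal
    v_add_i32_e32 %dst, vcc, 64, %v0   ; after commuting, immediate in src0 -> legal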
- unsigned CommuteIdx0; - unsigned CommuteIdx1; - bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); - - if (CanCommute) { - if (CommuteIdx0 == OpNo) - OpNo = CommuteIdx1; - else if (CommuteIdx1 == OpNo) - OpNo = CommuteIdx0; - } - - if (!CanCommute || !TII->commuteInstruction(MI)) - return false; - - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) - return false; - } - - FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); - return true; -} - -bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - if (!isSafeToFold(MI.getOpcode())) - continue; - - unsigned OpSize = TII->getOpSize(MI, 1); - MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm(); - - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. - if (!FoldingImm && !OpToFold.isReg()) - continue; - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && - !MRI.hasOneUse(MI.getOperand(0).getReg())) - continue; - - // FIXME: Fold operands with subregs. - if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) - continue; - - std::vector FoldList; - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); - Use != E; ++Use) { - - MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. 
- if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. - } - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (!Fold.isImm()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); - } - } - } - } - return false; -} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp deleted file mode 100644 index 12d08cf4c7f..00000000000 --- a/lib/Target/R600/SIISelLowering.cpp +++ /dev/null @@ -1,2241 +0,0 @@ -//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Custom DAG lowering for SI -// -//===----------------------------------------------------------------------===// - -#ifdef _MSC_VER -// Provide M_PI. -#define _USE_MATH_DEFINES -#include -#endif - -#include "SIISelLowering.h" -#include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Function.h" -#include "llvm/ADT/SmallString.h" - -using namespace llvm; - -SITargetLowering::SITargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : AMDGPUTargetLowering(TM, STI) { - addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); - addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - - addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); - addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - - addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); - - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); - - addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - - addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); - - addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - - computeRegisterProperties(STI.getRegisterInfo()); - - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - - setOperationAction(ISD::ADD, MVT::i32, Legal); - setOperationAction(ISD::ADDC, 
MVT::i32, Legal); - setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::SUBC, MVT::i32, Legal); - setOperationAction(ISD::SUBE, MVT::i32, Legal); - - setOperationAction(ISD::FSIN, MVT::f32, Custom); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - - // We need to custom lower vector stores from local memory - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::LOAD, MVT::v8i32, Custom); - setOperationAction(ISD::LOAD, MVT::v16i32, Custom); - - setOperationAction(ISD::STORE, MVT::v8i32, Custom); - setOperationAction(ISD::STORE, MVT::v16i32, Custom); - - setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setOperationAction(ISD::SELECT, MVT::i64, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Promote); - AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - - setOperationAction(ISD::SETCC, MVT::v2i1, Expand); - setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - - setOperationAction(ISD::BSWAP, MVT::i32, Legal); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - if (VT == MVT::i64) - continue; - - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); - } - - for (MVT VT : MVT::fp_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - - setTruncStoreAction(MVT::i64, MVT::i32, 
Expand); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - - setOperationAction(ISD::LOAD, MVT::i1, Custom); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - // These should use UDIVREM, so set them to expand - setOperationAction(ISD::UDIV, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT, MVT::i1, Promote); - - // We only support LOAD/STORE and vector manipulation ops for vectors - // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { - for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { - switch(Op) { - case ISD::LOAD: - case ISD::STORE: - case ISD::BUILD_VECTOR: - case ISD::BITCAST: - case ISD::EXTRACT_VECTOR_ELT: - case ISD::INSERT_VECTOR_ELT: - case ISD::INSERT_SUBVECTOR: - case ISD::EXTRACT_SUBVECTOR: - break; - case ISD::CONCAT_VECTORS: - setOperationAction(Op, VT, Custom); - break; - default: - setOperationAction(Op, VT, Expand); - break; - } - } - } - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - } - - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FDIV, MVT::f32, Custom); - setOperationAction(ISD::FDIV, MVT::f64, Custom); - - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FMINNUM); - setTargetDAGCombine(ISD::FMAXNUM); - setTargetDAGCombine(ISD::SMIN); - setTargetDAGCombine(ISD::SMAX); - setTargetDAGCombine(ISD::UMIN); - setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::UINT_TO_FP); - - // All memory operations. Some folding on the pointer operand is done to help - // matching the constant offsets in the addressing modes. - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::ATOMIC_LOAD); - setTargetDAGCombine(ISD::ATOMIC_STORE); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); - setTargetDAGCombine(ISD::ATOMIC_SWAP); - setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); - setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); - setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); - - setSchedulingPreference(Sched::RegPressure); -} - -//===----------------------------------------------------------------------===// -// TargetLowering queries -//===----------------------------------------------------------------------===// - -bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl &, - EVT) const { - // SI has some legal vector types, but no legal vector operations. Say no - // shuffles are legal in order to prefer scalarizing some vector operations. 
- return false; -} - -bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, unsigned AS) const { - // No global is ever allowed as a base. - if (AM.BaseGV) - return false; - - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? - - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. - return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. - return true; - default: // Don't allow n * r - return false; - } - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // Basic, single offset DS instructions allow a 16-bit unsigned immediate - // field. - // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have - // an 8-bit dword offset but we don't know the alignment here. - if (!isUInt<16>(AM.BaseOffs)) - return false; - - if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. - return true; - - if (AM.Scale == 1 && AM.HasBaseReg) - return true; - - return false; - } - case AMDGPUAS::FLAT_ADDRESS: { - // Flat instructions do not have offsets, and only have the register - // address. - return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); - } - default: - llvm_unreachable("unhandled address space"); - } -} - -bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align, - bool *IsFast) const { - if (IsFast) - *IsFast = false; - - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - if (!VT.isSimple() || VT == MVT::Other) - return false; - - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. - - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. - return Align % 4 == 0; - } - - // Smaller than dword value must be aligned. 
- // FIXME: This should be allowed on CI+ - if (VT.bitsLT(MVT::i32)) - return false; - - // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the - // byte-address are ignored, thus forcing Dword alignment. - // This applies to private, global, and constant memory. - if (IsFast) - *IsFast = true; - - return VT.bitsGT(MVT::i32) && Align % 4 == 0; -} - -EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - // FIXME: Should account for address space here. - - // The default fallback uses the private pointer size as a guess for a type to - // use. Make sure we switch these to 64-bit accesses. - - if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global - return MVT::v4i32; - - if (Size >= 8 && DstAlign >= 4) - return MVT::v2i32; - - // Use the default. - return MVT::Other; -} - -TargetLoweringBase::LegalizeTypeAction -SITargetLowering::getPreferredVectorAction(EVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - - return TargetLoweringBase::getPreferredVectorAction(VT); -} - -bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - return TII->isInlineConstant(Imm); -} - -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { - const DataLayout *DL = getDataLayout(); - MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); - SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - - unsigned Align = DL->getABITypeAlignment(Ty); - - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. - assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); - } - - ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment -} - -SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - - MachineFunction &MF = DAG.getMachineFunction(); - FunctionType *FType = MF.getFunction()->getFunctionType(); - SIMachineFunctionInfo *Info = MF.getInfo(); - - assert(CallConv == CallingConv::C); - - SmallVector Splits; - BitVector Skipped(Ins.size()); - - for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { - const ISD::InputArg &Arg = Ins[i]; - - // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); - - if (!Arg.Used) { - // We can savely skip PS inputs - Skipped.set(i); - ++PSInputNum; - continue; - } - - Info->PSInputAddr |= 1 << PSInputNum++; - } - - // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eigth. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); - } - } - - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); - - // At least one interpolation mode must be enabled or else the GPU will hang. - if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - } - - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
- else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - } - - if (Info->getShaderType() == ShaderType::COMPUTE) { - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - Splits); - } - - AnalyzeFormalArguments(CCInfo, Splits); - - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { - - const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { - InVals.push_back(DAG.getUNDEF(Arg.VT)); - continue; - } - - CCValAssign &VA = ArgLocs[ArgIdx++]; - MVT VT = VA.getLocVT(); - - if (VA.isMemLoc()) { - VT = Ins[i].VT; - EVT MemVT = Splits[i].VT; - const unsigned Offset = 36 + VA.getLocMemOffset(); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - Offset, Ins[i].Flags.isSExt()); - - const PointerType *ParamTy = - dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && - ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // On SI local pointers are just offsets into LDS, so they are always - // less than 16-bits. On CI and newer they could potentially be - // real pointers, so we can't guarantee their size. 
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); - } - - InVals.push_back(Arg); - Info->ABIArgOffset = Offset + MemVT.getStoreSize(); - continue; - } - assert(VA.isRegLoc() && "Parameter must be in a register!"); - - unsigned Reg = VA.getLocReg(); - - if (VT == MVT::i64) { - // For now assume it is a pointer - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, - &AMDGPU::SReg_64RegClass); - Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); - InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); - continue; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - - Reg = MF.addLiveIn(Reg, RC); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - - if (Arg.VT.isVector()) { - - // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - SmallVector Regs; - Regs.push_back(Val); - for (unsigned j = 1; j != NumElements; ++j) { - Reg = ArgLocs[ArgIdx++].getLocReg(); - Reg = MF.addLiveIn(Reg, RC); - Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); - } - - // Fill up the missing vector elements - NumElements = Arg.VT.getVectorNumElements() - NumElements; - Regs.append(NumElements, DAG.getUNDEF(VT)); - - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); - continue; - } - - InVals.push_back(Val); - } - - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); - } - return Chain; -} - -MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - switch (MI->getOpcode()) { - default: - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::BRANCH: - return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } - } - return BB; -} - -bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { - // This currently forces unfolding various combinations of fsub into fma with - // free fneg'd operands. As long as we have fast FMA (controlled by - // isFMAFasterThanFMulAndFAdd), we should perform these. - - // When fma is quarter rate, for f64 where add / sub are at best half rate, - // most of these combines appear to be cycle neutral but save on instruction - // count / code size. - return true; -} - -EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { - if (!VT.isVector()) { - return MVT::i1; - } - return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); -} - -MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { - return MVT::i32; -} - -// Answering this is somewhat tricky and depends on the specific device which -// have different rates for fma or all f64 operations. 
-// -// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other -// regardless of which device (although the number of cycles differs between -// devices), so it is always profitable for f64. -// -// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable -// only on full rate devices. Normally, we should prefer selecting v_mad_f32 -// which we can always do even without fused FP ops since it returns the same -// result as the separate operations and since it is always full -// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 -// however does not support denormals, so we do report fma as faster if we have -// a fast fma device and require denormals. -// -bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; - - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - // This is as fast on some subtargets. However, we always have full rate f32 - // mad available which returns the same result as the separate operations - // which we should prefer over fma. We can't use this if we want to support - // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); - case MVT::f64: - return true; - default: - break; - } - - return false; -} - -//===----------------------------------------------------------------------===// -// Custom DAG Lowering Operations -//===----------------------------------------------------------------------===// - -SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::LOAD: { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } - - case ISD::FSIN: - case ISD::FCOS: - return LowerTrig(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::FDIV: return LowerFDIV(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::GlobalAddress: { - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *MFI = MF.getInfo(); - return LowerGlobalAddress(MFI, Op, DAG); - } - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); - } - return SDValue(); -} - -/// \brief Helper function for LowerBRCOND -static SDNode *findUser(SDValue Value, unsigned Opcode) { - - SDNode *Parent = Value.getNode(); - for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); - I != E; ++I) { - - if (I.getUse().get() != Value) - continue; - - if (I->getOpcode() == Opcode) - return *I; - } - return nullptr; -} - -SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { - - FrameIndexSDNode *FINode = cast(Op); - unsigned FrameIndex = FINode->getIndex(); - - return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); -} - -/// This transforms the control flow intrinsics to get the branch destination as -/// last parameter, also switches branch target with BR if the need arise -SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, - SelectionDAG &DAG) const { - - SDLoc DL(BRCOND); - - SDNode *Intr = BRCOND.getOperand(1).getNode(); - SDValue Target = BRCOND.getOperand(2); - SDNode *BR = nullptr; - - if 
(Intr->getOpcode() == ISD::SETCC) { - // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); - Intr = SetCC->getOperand(0).getNode(); - - } else { - // Get the target from BR if we don't negate the condition - BR = findUser(BRCOND, ISD::BR); - Target = BR->getOperand(1); - } - - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); - - // Build the result and - ArrayRef Res(Intr->value_begin() + 1, Intr->value_end()); - - // operands of the new intrinsic call - SmallVector Ops; - Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + 1, Intr->op_end()); - Ops.push_back(Target); - - // build the new intrinsic call - SDNode *Result = DAG.getNode( - Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res), Ops).getNode(); - - if (BR) { - // Give the branch instruction our target - SDValue Ops[] = { - BR->getOperand(0), - BRCOND.getOperand(2) - }; - SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); - DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); - BR = NewBR.getNode(); - } - - SDValue Chain = SDValue(Result, Result->getNumValues() - 1); - - // Copy the intrinsic results to registers - for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { - SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); - if (!CopyToReg) - continue; - - Chain = DAG.getCopyToReg( - Chain, DL, - CopyToReg->getOperand(1), - SDValue(Result, i - 1), - SDValue()); - - DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); - } - - // Remove the old intrinsic from the chain - DAG.ReplaceAllUsesOfValueWith( - SDValue(Intr, Intr->getNumValues() - 1), - Intr->getOperand(0)); - - return Chain; -} - -SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, - SDValue Op, - SelectionDAG &DAG) const { - GlobalAddressSDNode *GSD = cast(Op); - - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) - return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); - - SDLoc DL(GSD); - const GlobalValue *GV = GSD->getGlobal(); - MVT PtrVT = getPointerTy(GSD->getAddressSpace()); - - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); -} - -SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, - SDValue V) const { - // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, - // so we will end up with redundant moves to m0. - // - // We can't use S_MOV_B32, because there is no way to specify m0 as the - // destination register. - // - // We have to use them both. Machine cse will combine all the S_MOV_B32 - // instructions and the register coalescer eliminate the extra copies. 
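(Illustrative only, assuming the comment above holds; register names hypothetical. The node built below selects to a two-step sequence that later passes are expected to collapse to one move per distinct value:)

    %tmp = S_MOV_B32 <value>   ; MachineCSE merges duplicate S_MOV_B32s
    $m0  = COPY %tmp           ; register coalescer folds this, leaving s_mov_b32 m0, <value>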
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); - return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), - SDValue(M0, 0), SDValue()); // Glue - // A Null SDValue creates - // a glue result. -} - -SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); - - switch (IntrinsicID) { - case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); - case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); - case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); - case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); - case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); - case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); - case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); - case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); - case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - - case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - MF.getInfo()->ABIArgOffset, - false); - - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops[] = { - Op.getOperand(1), - Op.getOperand(2) - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, VT, MMO); - } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return 
LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); - case AMDGPUIntrinsic::SI_fs_constant: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(1), Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_fs_interp: { - SDValue IJ = Op.getOperand(4); - SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(0, DL, MVT::i32)); - SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(1, DL, MVT::i32)); - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, - DAG.getVTList(MVT::f32, MVT::Glue), - I, Op.getOperand(1), Op.getOperand(2), Glue); - Glue = SDValue(P1.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, - Op.getOperand(1), Op.getOperand(2), Glue); - } - default: - return AMDGPUTargetLowering::LowerOperation(Op, DAG); - } -} - -SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); - - switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_sendmsg: { - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } - default: - return SDValue(); - } -} - -SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - LoadSDNode *Load = cast(Op); - - if (Op.getValueType().isVector()) { - assert(Op.getValueType().getVectorElementType() == MVT::i32 && - "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = Op.getValueType().getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); - switch (Load->getAddressSpace()) { - default: break; - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - // v4 loads are supported for private and global memory. 
- if (NumElements <= 4) - break; - // fall-through - case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); - } - } - - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} - -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); -} - -SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); - - SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); - - SDValue Zero = DAG.getConstant(0, DL, MVT::i32); - SDValue One = DAG.getConstant(1, DL, MVT::i32); - - SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); - SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); - - SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); - SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); - - SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); - - SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); - SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); - - SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); -} - -// Catch division cases where we can use shortcuts with rcp and rsq -// instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - EVT VT = Op.getValueType(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; - - if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { - if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && - CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. - - // 1.0 / sqrt(x) -> rsq(x) - // - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - } - } - - if (Unsafe) { - // Turn into multiply by the reciprocal. - // x / y -> x * (1.0 / y) - SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); - } - - return SDValue(); -} - -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) - return FastLowered; - - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. 
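(A hedged scalar sketch of the lowering below, not part of the original file: the bit patterns used as constants are 0x6f800000 ≈ 2^96 and 0x2f800000 = 2^-32, so in pseudo-C the sequence computes roughly the following, where rcp() stands in for v_rcp_f32:)

    float fdiv32(float x, float y) {
      // Pre-scale huge denominators so rcp(y) does not hit the denormal range,
      // then undo the scaling on the quotient.
      float s = (fabsf(y) > 0x1.0p96f) ? 0x1.0p-32f : 1.0f;
      return s * (x * rcp(y * s));
    }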
- if (Subtarget->hasFP32Denormals()) - return SDValue(); - - SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); - - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); -} - -SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); - - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); - - SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); - - SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); - - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); - - SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); - - SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); - - SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); - - SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); - - SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); - - SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); - - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, - NegDivScale0, Mul, DivScale1); - - SDValue Scale; - - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { - // Workaround a hardware bug on SI where the condition output from div_scale - // is not usable. - - const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); - - // Figure out if the scale to use for div_fmas. 
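    // The code below reconstructs that flag by hand: bitcast the f64 operands
    // and both div_scale results to v2i32, compare the high dwords of the
    // operands against the two results, and XOR the comparisons to form the
    // scale bit passed to DIV_FMAS.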
- SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); - SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); - SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); - - SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); - SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); - - SDValue Scale0Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); - SDValue Scale1Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); - - SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); - SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); - Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); - } else { - Scale = DivScale1.getValue(1); - } - - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, - Fma4, Fma3, Mul, Scale); - - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); -} - -SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (VT == MVT::f32) - return LowerFDIV32(Op, DAG); - - if (VT == MVT::f64) - return LowerFDIV64(Op, DAG); - - llvm_unreachable("Unexpected type for fdiv"); -} - -SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - StoreSDNode *Store = cast(Op); - EVT VT = Store->getMemoryVT(); - - // These stores are legal. - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); - } - - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; - - if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); - - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); - - return SDValue(); -} - -SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Arg = Op.getOperand(0); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.5/M_PI, DL, - VT))); - - switch (Op.getOpcode()) { - case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); - case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); - default: - llvm_unreachable("Wrong trig opcode"); - } -} - -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// - -SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - EVT ScalarVT = VT.getScalarType(); - if (ScalarVT != MVT::f32) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - SDValue Src = N->getOperand(0); - EVT SrcVT = Src.getValueType(); - - // TODO: We could try to match extracting the higher bytes, which would be - // easier if i8 vectors weren't promoted to i32 vectors, particularly after - // types are legalized. v4i8 -> v4f32 is probably the only case to worry - // about in practice. 
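  // The main case handled further down: a v4i8 extending load feeding
  // uint_to_fp becomes a single zero-extending i32 load plus one
  // CVT_F32_UBYTE0..3 per byte lane, instead of being expanded to v4i32
  // and repacked.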
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { - if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); - DCI.AddToWorklist(Cvt.getNode()); - return Cvt; - } - } - - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. - // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - - return SDValue(); -} - -/// \brief Return true if the given offset Size in bytes can be folded into -/// the immediate offsets of a memory instruction for the given address space. -static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { - // MUBUF instructions a 12-bit offset in bytes. - return isUInt<12>(OffsetSize); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - // SMRD instructions have an 8-bit offset in dwords on SI and - // a 20-bit offset in bytes on VI. 
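    // Concretely: on SI the largest foldable offset is 255 dwords (1020 bytes)
    // and it must be dword-aligned, so 1020 folds while 6 and 1024 do not;
    // on VI any byte offset below 2^20 can be folded.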
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return isUInt<20>(OffsetSize); - else - return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // The single offset versions have a 16-bit offset in bytes. - return isUInt<16>(OffsetSize); - } - case AMDGPUAS::PRIVATE_ADDRESS: - // Indirect register addressing does not use any offsets. - default: - return 0; - } -} - -// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) - -// This is a variant of -// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), -// -// The normal DAG combiner will do this, but only if the add has one use since -// that would increase the number of instructions. -// -// This prevents us from seeing a constant offset that can be folded into a -// memory instruction's addressing mode. If we know the resulting add offset of -// a pointer can be folded into an addressing offset, we can replace the pointer -// operand with the add of new constant offset. This eliminates one of the uses, -// and may allow the remaining use to also be simplified. -// -SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, - unsigned AddrSpace, - DAGCombinerInfo &DCI) const { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - if (N0.getOpcode() != ISD::ADD) - return SDValue(); - - const ConstantSDNode *CN1 = dyn_cast(N1); - if (!CN1) - return SDValue(); - - const ConstantSDNode *CAdd = dyn_cast(N0.getOperand(1)); - if (!CAdd) - return SDValue(); - - // If the resulting offset is too large, we can't fold it into the addressing - // mode offset. - APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - EVT VT = N->getValueType(0); - - SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); - SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); - - return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); -} - -SDValue SITargetLowering::performAndCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (DCI.isBeforeLegalize()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - - // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> - // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::SETCC && - RHS.getOpcode() == ISD::SETCC) { - ISD::CondCode LCC = cast(LHS.getOperand(2))->get(); - ISD::CondCode RCC = cast(RHS.getOperand(2))->get(); - - SDValue X = LHS.getOperand(0); - SDValue Y = RHS.getOperand(0); - if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) - return SDValue(); - - if (LCC == ISD::SETO) { - if (X != LHS.getOperand(1)) - return SDValue(); - - if (RCC == ISD::SETUNE) { - const ConstantFPSDNode *C1 = dyn_cast(RHS.getOperand(1)); - if (!C1 || !C1->isInfinity() || C1->isNegative()) - return SDValue(); - - const uint32_t Mask = SIInstrFlags::N_NORMAL | - SIInstrFlags::N_SUBNORMAL | - SIInstrFlags::N_ZERO | - SIInstrFlags::P_ZERO | - SIInstrFlags::P_SUBNORMAL | - SIInstrFlags::P_NORMAL; - - static_assert(((~(SIInstrFlags::S_NAN | - SIInstrFlags::Q_NAN | - SIInstrFlags::N_INFINITY | - SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, - "mask not equal"); - - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - X, DAG.getConstant(Mask, DL, MVT::i32)); - } - } - } - - return SDValue(); -} - -SDValue SITargetLowering::performOrCombine(SDNode *N, - 
DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) - if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && - RHS.getOpcode() == AMDGPUISD::FP_CLASS) { - SDValue Src = LHS.getOperand(0); - if (Src != RHS.getOperand(0)) - return SDValue(); - - const ConstantSDNode *CLHS = dyn_cast(LHS.getOperand(1)); - const ConstantSDNode *CRHS = dyn_cast(RHS.getOperand(1)); - if (!CLHS || !CRHS) - return SDValue(); - - // Only 10 bits are used. - static const uint32_t MaxMask = 0x3ff; - - uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - Src, DAG.getConstant(NewMask, DL, MVT::i32)); - } - - return SDValue(); -} - -SDValue SITargetLowering::performClassCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue Mask = N->getOperand(1); - - // fp_class x, 0 -> false - if (const ConstantSDNode *CMask = dyn_cast(Mask)) { - if (CMask->isNullValue()) - return DAG.getConstant(0, SDLoc(N), MVT::i1); - } - - return SDValue(); -} - -static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { - switch (Opc) { - case ISD::FMAXNUM: - return AMDGPUISD::FMAX3; - case ISD::SMAX: - return AMDGPUISD::SMAX3; - case ISD::UMAX: - return AMDGPUISD::UMAX3; - case ISD::FMINNUM: - return AMDGPUISD::FMIN3; - case ISD::SMIN: - return AMDGPUISD::SMIN3; - case ISD::UMIN: - return AMDGPUISD::UMIN3; - default: - llvm_unreachable("Not a min/max opcode"); - } -} - -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - - unsigned Opc = N->getOpcode(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - // Only do this if the inner op has one use since this will just increases - // register pressure for no benefit. 
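  // (If the inner min/max has other users it stays live anyway, so the fold
  // would add a min3/max3 next to it rather than replacing it.)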
- - // max(max(a, b), c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); - } - - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); - } - - return SDValue(); -} - -SDValue SITargetLowering::performSetCCCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - EVT VT = LHS.getValueType(); - - if (VT != MVT::f32 && VT != MVT::f64) - return SDValue(); - - // Match isinf pattern - // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast(N->getOperand(2))->get(); - if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { - const ConstantFPSDNode *CRHS = dyn_cast(RHS); - if (!CRHS) - return SDValue(); - - const APFloat &APF = CRHS->getValueAPF(); - if (APF.isInfinity() && !APF.isNegative()) { - unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; - return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(Mask, SL, MVT::i32)); - } - } - - return SDValue(); -} - -SDValue SITargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - switch (N->getOpcode()) { - default: - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SETCC: - return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? - case ISD::FMINNUM: - case ISD::SMAX: - case ISD::SMIN: - case ISD::UMAX: - case ISD::UMIN: { - if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - N->getValueType(0) != MVT::f64 && - getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); - break; - } - - case AMDGPUISD::CVT_F32_UBYTE0: - case AMDGPUISD::CVT_F32_UBYTE1: - case AMDGPUISD::CVT_F32_UBYTE2: - case AMDGPUISD::CVT_F32_UBYTE3: { - unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - - SDValue Src = N->getOperand(0); - APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || - TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - - break; - } - - case ISD::UINT_TO_FP: { - return performUCharToFloatCombine(N, DCI); - - case ISD::FADD: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - if (VT != MVT::f32) - break; - - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (Subtarget->hasFP32Denormals()) - break; - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. 
- - // fadd (fadd (a, a), b) -> mad 2.0, a, b - if (LHS.getOpcode() == ISD::FADD) { - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); - } - } - - // fadd (b, fadd (a, a)) -> mad 2.0, a, b - if (RHS.getOpcode() == ISD::FADD) { - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); - } - } - - return SDValue(); - } - case ISD::FSUB: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - - // Try to get the fneg to fold into the source modifier. This undoes generic - // DAG combines and folds them into the mad. - // - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (VT == MVT::f32 && - !Subtarget->hasFP32Denormals()) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() == ISD::FADD) { - // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) - - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); - } - } - - if (RHS.getOpcode() == ISD::FADD) { - // (fsub c, (fadd a, a)) -> mad -2.0, a, c - - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); - } - } - - return SDValue(); - } - - break; - } - } - case ISD::LOAD: - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: - case ISD::ATOMIC_CMP_SWAP: - case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. - if (DCI.isBeforeLegalize()) - break; - - MemSDNode *MemNode = cast(N); - SDValue Ptr = MemNode->getBasePtr(); - - // TODO: We could also do this for multiplies. - unsigned AS = MemNode->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { - SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); - if (NewPtr) { - SmallVector NewOps(MemNode->op_begin(), MemNode->op_end()); - - NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; - return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); - } - } - break; - } - case ISD::AND: - return performAndCombine(N, DCI); - case ISD::OR: - return performOrCombine(N, DCI); - case AMDGPUISD::FP_CLASS: - return performClassCombine(N, DCI); - } - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); -} - -/// \brief Analyze the possible immediate value Op -/// -/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate -/// and the immediate value if it's a literal immediate -int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - if (const ConstantSDNode *Node = dyn_cast(N)) { - if (TII->isInlineConstant(Node->getAPIntValue())) - return 0; - - uint64_t Val = Node->getZExtValue(); - return isUInt<32>(Val) ? 
Val : -1; - } - - if (const ConstantFPSDNode *Node = dyn_cast(N)) { - if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) - return 0; - - if (Node->getValueType(0) == MVT::f32) - return FloatToBits(Node->getValueAPF().convertToFloat()); - - return -1; - } - - return -1; -} - -/// \brief Helper function for adjustWritemask -static unsigned SubIdx2Lane(unsigned Idx) { - switch (Idx) { - default: return 0; - case AMDGPU::sub0: return 0; - case AMDGPU::sub1: return 1; - case AMDGPU::sub2: return 2; - case AMDGPU::sub3: return 3; - } -} - -/// \brief Adjust the writemask of MIMG instructions -void SITargetLowering::adjustWritemask(MachineSDNode *&Node, - SelectionDAG &DAG) const { - SDNode *Users[4] = { }; - unsigned Lane = 0; - unsigned OldDmask = Node->getConstantOperandVal(0); - unsigned NewDmask = 0; - - // Try to figure out the used register components - for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); - I != E; ++I) { - - // Abort if we can't understand the usage - if (!I->isMachineOpcode() || - I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) - return; - - // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. - // Note that subregs are packed, i.e. Lane==0 is the first bit set - // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit - // set, etc. - Lane = SubIdx2Lane(I->getConstantOperandVal(1)); - - // Set which texture component corresponds to the lane. - unsigned Comp; - for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { - assert(Dmask); - Comp = countTrailingZeros(Dmask); - Dmask &= ~(1 << Comp); - } - - // Abort if we have more than one user per component - if (Users[Lane]) - return; - - Users[Lane] = *I; - NewDmask |= 1 << Comp; - } - - // Abort if there's no change - if (NewDmask == OldDmask) - return; - - // Adjust the writemask in the node - std::vector Ops; - Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); - Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); - - // If we only got one lane, replace it with a copy - // (if NewDmask has only one bit set...) - if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), - MVT::i32); - SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - SDLoc(), Users[Lane]->getValueType(0), - SDValue(Node, 0), RC); - DAG.ReplaceAllUsesWith(Users[Lane], Copy); - return; - } - - // Update the users of the node with the new indices - for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { - - SDNode *User = Users[i]; - if (!User) - continue; - - SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, User->getOperand(0), Op); - - switch (Idx) { - default: break; - case AMDGPU::sub0: Idx = AMDGPU::sub1; break; - case AMDGPU::sub1: Idx = AMDGPU::sub2; break; - case AMDGPU::sub2: Idx = AMDGPU::sub3; break; - } - } -} - -/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) -/// with frame index operands. -/// LLVM assumes that inputs are to these instructions are registers. 
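/// Frame-index operands are therefore rewritten here into the results of
/// S_MOV_B32 nodes, so the target-independent instruction only ever sees
/// register operands.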
-void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, - SelectionDAG &DAG) const { - - SmallVector Ops; - for (unsigned i = 0; i < Node->getNumOperands(); ++i) { - if (!isa(Node->getOperand(i))) { - Ops.push_back(Node->getOperand(i)); - continue; - } - - SDLoc DL(Node); - Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, - Node->getOperand(i).getValueType(), - Node->getOperand(i)), 0)); - } - - DAG.UpdateNodeOperands(Node, Ops); -} - -/// \brief Fold the instructions after selecting them. -SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - if (TII->isMIMG(Node->getMachineOpcode())) - adjustWritemask(Node, DAG); - - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { - legalizeTargetIndependentNode(Node, DAG); - return Node; - } - return Node; -} - -/// \brief Assign the register class depending on the number of -/// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - - if (TII->isMIMG(MI->getOpcode())) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); - unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; - - const TargetRegisterClass *RC; - switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VGPR_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; - } - - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); - MRI.setRegClass(VReg, RC); - return; - } - - // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); - if (NoRetAtomicOp != -1) { - if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); - } - - return; - } -} - -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { - SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); - return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); -} - -MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. 
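  // The REG_SEQUENCE below assembles the final SReg_128 resource descriptor:
  // the 64-bit pointer occupies sub0_sub1 and the constant half built above
  // (zero plus the default resource data format) occupies sub2_sub3.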
- const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; - - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; - - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); - -#endif -} - -/// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. -MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, - uint64_t RsrcDword2And3) const { - SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); - SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); - if (RsrcDword1) { - PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, - DAG.getConstant(RsrcDword1, DL, MVT::i32)), - 0); - } - - SDValue DataLo = buildSMovImm32(DAG, DL, - RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); - SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); - - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - PtrLo, - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - PtrHi, - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - DataLo, - DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), - DataHi, - DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) - }; - - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); -} - -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; // Size - - return buildRSRC(DAG, DL, Ptr, 0, Rsrc); -} - -SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); - - return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), - cast(VReg)->getReg(), VT); -} - -//===----------------------------------------------------------------------===// -// SI Inline Assembly Support -//===----------------------------------------------------------------------===// - -std::pair -SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: - return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); - } - } - - if (Constraint.size() > 1) { - const TargetRegisterClass *RC = nullptr; - if (Constraint[1] == 'v') { - RC = &AMDGPU::VGPR_32RegClass; - } else if (Constraint[1] == 's') { - RC = &AMDGPU::SGPR_32RegClass; - } - - if (RC) { - unsigned Idx = 
std::atoi(Constraint.substr(2).c_str()); - if (Idx < RC->getNumRegs()) - return std::make_pair(RC->getRegister(Idx), RC); - } - } - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); -} diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h deleted file mode 100644 index a956b013bdb..00000000000 --- a/lib/Target/R600/SIISelLowering.h +++ /dev/null @@ -1,125 +0,0 @@ -//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI DAG Lowering interface definition -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H -#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H - -#include "AMDGPUISelLowering.h" -#include "SIInstrInfo.h" - -namespace llvm { - -class SITargetLowering : public AMDGPUTargetLowering { - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, - SDValue Chain, unsigned Offset, bool Signed) const; - SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, - SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const override; - - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - - void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; - - SDValue performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const; - SDValue performSHLPtrCombine(SDNode *N, - unsigned AS, - DAGCombinerInfo &DCI) const; - SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; - - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; - -public: - SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); - - bool isShuffleMaskLegal(const SmallVectorImpl &/*Mask*/, - EVT /*VT*/) const override; - - bool isLegalAddressingMode(const AddrMode &AM, - Type *Ty, unsigned AS) const override; - - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const override; - - TargetLoweringBase::LegalizeTypeAction - getPreferredVectorAction(EVT VT) const override; - - bool 
shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const override; - bool enableAggressiveFMAFusion(EVT VT) const override; - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - MVT getScalarShiftAmountTy(EVT VT) const override; - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; - void AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const override; - - int32_t analyzeImmediate(const SDNode *N) const; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; - void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; - - MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; - MachineSDNode *buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, - uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - - std::pair getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, - const std::string &Constraint, MVT VT) const override; - SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp deleted file mode 100644 index 90a37f17468..00000000000 --- a/lib/Target/R600/SIInsertWaits.cpp +++ /dev/null @@ -1,480 +0,0 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Insert wait instructions for memory reads and writes. -/// -/// Memory reads and writes are issued asynchronously, so we need to insert -/// S_WAITCNT instructions when we want to access any of their results or -/// overwrite any register that's used asynchronously. 
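/// For example, the result of an SMRD load must not be consumed until
/// LGKM_CNT has dropped far enough to cover that load; the pass tracks, per
/// register, the counter values at the last asynchronous def and use and
/// emits an S_WAITCNT sized to cover them.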
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -/// \brief One variable for each of the hardware counters -typedef union { - struct { - unsigned VM; - unsigned EXP; - unsigned LGKM; - } Named; - unsigned Array[3]; - -} Counters; - -typedef enum { - OTHER, - SMEM, - VMEM -} InstType; - -typedef Counters RegCounters[512]; -typedef std::pair RegInterval; - -class SIInsertWaits : public MachineFunctionPass { - -private: - static char ID; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - - /// \brief Constant hardware limits - static const Counters WaitCounts; - - /// \brief Constant zero value - static const Counters ZeroCounts; - - /// \brief Counter values we have already waited on. - Counters WaitedOn; - - /// \brief Counter values for last instruction issued. - Counters LastIssued; - - /// \brief Registers used by async instructions. - RegCounters UsedRegs; - - /// \brief Registers defined by async instructions. - RegCounters DefinedRegs; - - /// \brief Different export instruction types seen since last wait. - unsigned ExpInstrTypesSeen; - - /// \brief Type of the last opcode. - InstType LastOpcodeType; - - bool LastInstWritesM0; - - /// \brief Get increment/decrement amount for this instruction. - Counters getHwCounts(MachineInstr &MI); - - /// \brief Is operand relevant for async execution? - bool isOpRelevant(MachineOperand &Op); - - /// \brief Get register interval an operand affects. - RegInterval getRegInterval(MachineOperand &Op); - - /// \brief Handle instructions async components - void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); - - /// \brief Insert the actual wait instruction - bool insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Counts); - - /// \brief Do we need def2def checks? - bool unorderedDefines(MachineInstr &MI); - - /// \brief Resolve all operand dependencies to counter requirements - Counters handleOperands(MachineInstr &MI); - - /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 
- void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - -public: - SIInsertWaits(TargetMachine &tm) : - MachineFunctionPass(ID), - TII(nullptr), - TRI(nullptr), - ExpInstrTypesSeen(0) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI insert wait instructions"; - } - -}; - -} // End anonymous namespace - -char SIInsertWaits::ID = 0; - -const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; -const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; - -FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { - return new SIInsertWaits(tm); -} - -Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; - - Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); - - // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && - (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); - - // LGKM may uses larger values - if (TSFlags & SIInstrFlags::LGKM_CNT) { - - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - - } else { - // DS - Result.Named.LGKM = 1; - } - - } else { - Result.Named.LGKM = 0; - } - - return Result; -} - -bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - - // Constants are always irrelevant - if (!Op.isReg()) - return false; - - // Defines are always relevant - if (Op.isDef()) - return true; - - // For exports all registers are relevant - MachineInstr &MI = *Op.getParent(); - if (MI.getOpcode() == AMDGPU::EXP) - return true; - - // For stores the stored value is also relevant - if (!MI.getDesc().mayStore()) - return false; - - // Check if this operand is the value being stored. - // Special case for DS instructions, since the address - // operand comes before the value operand and it may have - // multiple data operands. - - if (TII->isDS(MI.getOpcode())) { - MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); - if (Data && Op.isIdenticalTo(*Data)) - return true; - - MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - if (Data0 && Op.isIdenticalTo(*Data0)) - return true; - - MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - if (Data1 && Op.isIdenticalTo(*Data1)) - return true; - - return false; - } - - // NOTE: This assumes that the value operand is before the - // address operand, and that there is only one value operand. 
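  // Consequently, the scan below stops at the first register use operand and
  // treats it as the stored value.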
- for (MachineInstr::mop_iterator I = MI.operands_begin(), - E = MI.operands_end(); I != E; ++I) { - - if (I->isReg() && I->isUse()) - return Op.isIdenticalTo(*I); - } - - return false; -} - -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - - assert(Size >= 4); - - RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); - Result.second = Result.first + Size / 4; - - return Result; -} - -void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - - // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); - unsigned Sum = 0; - - for (unsigned i = 0; i < 3; ++i) { - LastIssued.Array[i] += Increment.Array[i]; - Sum += Increment.Array[i]; - } - - // If we don't increase anything then that's it - if (Sum == 0) { - LastOpcodeType = OTHER; - return; - } - - if (MBB.getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || - (LastOpcodeType == VMEM && Increment.Named.VM)) { - // Insert a NOP to break the clause. - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); - LastInstWritesM0 = false; - } - - if (TII->isSMRD(I->getOpcode())) - LastOpcodeType = SMEM; - else if (Increment.Named.VM) - LastOpcodeType = VMEM; - } - - // Remember which export instructions we have seen - if (Increment.Named.EXP) { - ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - - MachineOperand &Op = I->getOperand(i); - if (!isOpRelevant(Op)) - continue; - - RegInterval Interval = getRegInterval(Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - // Remember which registers we define - if (Op.isDef()) - DefinedRegs[j] = LastIssued; - - // and which one we are using - if (Op.isUse()) - UsedRegs[j] = LastIssued; - } - } -} - -bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Required) { - - // End of program? No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) - return false; - - // Figure out if the async instructions execute in order - bool Ordered[3]; - - // VM_CNT is always ordered - Ordered[0] = true; - - // EXP_CNT is unordered if we have both EXP & VM-writes - Ordered[1] = ExpInstrTypesSeen == 3; - - // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS - Ordered[2] = false; - - // The values we are going to put into the S_WAITCNT instruction - Counters Counts = WaitCounts; - - // Do we really need to wait? - bool NeedWait = false; - - for (unsigned i = 0; i < 3; ++i) { - - if (Required.Array[i] <= WaitedOn.Array[i]) - continue; - - NeedWait = true; - - if (Ordered[i]) { - unsigned Value = LastIssued.Array[i] - Required.Array[i]; - - // Adjust the value to the real hardware possibilities. 
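      // i.e. clamp to the largest value each counter field can encode
      // (15 for VM_CNT, 7 for EXP_CNT and LGKM_CNT, per WaitCounts above).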
- Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); - - } else - Counts.Array[i] = 0; - - // Remember on what we have waited on. - WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - } - - if (!NeedWait) - return false; - - // Reset EXP_CNT instruction types - if (Counts.Named.EXP == 0) - ExpInstrTypesSeen = 0; - - // Build the wait instruction - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm((Counts.Named.VM & 0xF) | - ((Counts.Named.EXP & 0x7) << 4) | - ((Counts.Named.LGKM & 0x7) << 8)); - - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - return true; -} - -/// \brief helper function for handleOperands -static void increaseCounters(Counters &Dst, const Counters &Src) { - - for (unsigned i = 0; i < 3; ++i) - Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); -} - -Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - - Counters Result = ZeroCounts; - - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - - // For each register affected by this - // instruction increase the result sequence - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - if (Op.isDef()) { - increaseCounters(Result, UsedRegs[j]); - increaseCounters(Result, DefinedRegs[j]); - } - - if (Op.isUse()) - increaseCounters(Result, DefinedRegs[j]); - } - } - - return Result; -} - -void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - if (MBB.getParent()->getSubtarget().getGeneration() < - AMDGPUSubtarget::VOLCANIC_ISLANDS) - return; - - // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. - if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - LastInstWritesM0 = false; - return; - } - - // Set whether this instruction sets M0 - LastInstWritesM0 = false; - - unsigned NumOperands = I->getNumOperands(); - for (unsigned i = 0; i < NumOperands; i++) { - const MachineOperand &Op = I->getOperand(i); - - if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) - LastInstWritesM0 = true; - } -} - -// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" -// around other non-memory instructions. -bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { - bool Changes = false; - - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - - MRI = &MF.getRegInfo(); - - WaitedOn = ZeroCounts; - LastIssued = ZeroCounts; - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - - memset(&UsedRegs, 0, sizeof(UsedRegs)); - memset(&DefinedRegs, 0, sizeof(DefinedRegs)); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - - // Wait for everything before a barrier. 
- if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); - else - Changes |= insertWait(MBB, I, handleOperands(*I)); - - pushInstruction(MBB, I); - handleSendMsg(MBB, I); - } - - // Wait for everything at the end of the MBB - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - } - - return Changes; -} diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td deleted file mode 100644 index 211666a9bdb..00000000000 --- a/lib/Target/R600/SIInstrFormats.td +++ /dev/null @@ -1,673 +0,0 @@ -//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// SI Instruction format definitions. -// -//===----------------------------------------------------------------------===// - -class InstSI pattern> : - AMDGPUInst, PredicateControl { - - field bits<1> VM_CNT = 0; - field bits<1> EXP_CNT = 0; - field bits<1> LGKM_CNT = 0; - - field bits<1> SALU = 0; - field bits<1> VALU = 0; - - field bits<1> SOP1 = 0; - field bits<1> SOP2 = 0; - field bits<1> SOPC = 0; - field bits<1> SOPK = 0; - field bits<1> SOPP = 0; - - field bits<1> VOP1 = 0; - field bits<1> VOP2 = 0; - field bits<1> VOP3 = 0; - field bits<1> VOPC = 0; - - field bits<1> MUBUF = 0; - field bits<1> MTBUF = 0; - field bits<1> SMRD = 0; - field bits<1> DS = 0; - field bits<1> MIMG = 0; - field bits<1> FLAT = 0; - field bits<1> WQM = 0; - field bits<1> VGPRSpill = 0; - - // These need to be kept in sync with the enum in SIInstrFlags. - let TSFlags{0} = VM_CNT; - let TSFlags{1} = EXP_CNT; - let TSFlags{2} = LGKM_CNT; - - let TSFlags{3} = SALU; - let TSFlags{4} = VALU; - - let TSFlags{5} = SOP1; - let TSFlags{6} = SOP2; - let TSFlags{7} = SOPC; - let TSFlags{8} = SOPK; - let TSFlags{9} = SOPP; - - let TSFlags{10} = VOP1; - let TSFlags{11} = VOP2; - let TSFlags{12} = VOP3; - let TSFlags{13} = VOPC; - - let TSFlags{14} = MUBUF; - let TSFlags{15} = MTBUF; - let TSFlags{16} = SMRD; - let TSFlags{17} = DS; - let TSFlags{18} = MIMG; - let TSFlags{19} = FLAT; - let TSFlags{20} = WQM; - let TSFlags{21} = VGPRSpill; - - // Most instructions require adjustments after selection to satisfy - // operand requirements. - let hasPostISelHook = 1; - let SchedRW = [Write32Bit]; -} - -class Enc32 { - field bits<32> Inst; - int Size = 4; -} - -class Enc64 { - field bits<64> Inst; - int Size = 8; -} - -class VOPDstOperand : RegisterOperand ; -def VOPDstVCC : VOPDstOperand ; - -let Uses = [EXEC] in { - -class VOPAnyCommon pattern> : - InstSI { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VALU = 1; -} - -class VOPCCommon pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { - - let DisableEncoding = "$dst"; - let VOPC = 1; - let Size = 4; -} - -class VOP1Common pattern> : - VOPAnyCommon { - - let VOP1 = 1; - let Size = 4; -} - -class VOP2Common pattern> : - VOPAnyCommon { - - let VOP2 = 1; - let Size = 4; -} - -class VOP3Common pattern> : - VOPAnyCommon { - - // Using complex patterns gives VOP3 patterns a very high complexity rating, - // but standalone patterns are almost always prefered, so we need to adjust the - // priority lower. The goal is to use a high number to reduce complexity to - // zero (or less than zero). 
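  // The large negative AddedComplexity below more than offsets that bonus, so
  // the standalone (non-VOP3) encodings win whenever both forms match.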
-  let AddedComplexity = -1000;
-
-  let VOP3 = 1;
-  let VALU = 1;
-
-  let AsmMatchConverter = "cvtVOP3";
-  let isCodeGenOnly = 0;
-
-  int Size = 8;
-}
-
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Scalar operations
-//===----------------------------------------------------------------------===//
-
-class SOP1e <bits<8> op> : Enc32 {
-  bits<7> sdst;
-  bits<8> ssrc0;
-
-  let Inst{7-0} = ssrc0;
-  let Inst{15-8} = op;
-  let Inst{22-16} = sdst;
-  let Inst{31-23} = 0x17d; //encoding;
-}
-
-class SOP2e <bits<7> op> : Enc32 {
-  bits<7> sdst;
-  bits<8> ssrc0;
-  bits<8> ssrc1;
-
-  let Inst{7-0} = ssrc0;
-  let Inst{15-8} = ssrc1;
-  let Inst{22-16} = sdst;
-  let Inst{29-23} = op;
-  let Inst{31-30} = 0x2; // encoding
-}
-
-class SOPCe <bits<7> op> : Enc32 {
-  bits<8> ssrc0;
-  bits<8> ssrc1;
-
-  let Inst{7-0} = ssrc0;
-  let Inst{15-8} = ssrc1;
-  let Inst{22-16} = op;
-  let Inst{31-23} = 0x17e;
-}
-
-class SOPKe <bits<5> op> : Enc32 {
-  bits <7> sdst;
-  bits <16> simm16;
-
-  let Inst{15-0} = simm16;
-  let Inst{22-16} = sdst;
-  let Inst{27-23} = op;
-  let Inst{31-28} = 0xb; //encoding
-}
-
-class SOPK64e <bits<5> op> : Enc64 {
-  bits <7> sdst = 0;
-  bits <16> simm16;
-  bits <32> imm;
-
-  let Inst{15-0} = simm16;
-  let Inst{22-16} = sdst;
-  let Inst{27-23} = op;
-  let Inst{31-28} = 0xb;
-
-  let Inst{63-32} = imm;
-}
-
-class SOPPe <bits<7> op> : Enc32 {
-  bits <16> simm16;
-
-  let Inst{15-0} = simm16;
-  let Inst{22-16} = op;
-  let Inst{31-23} = 0x17f; // encoding
-}
-
-class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
-  bits<7> sdst;
-  bits<7> sbase;
-  bits<8> offset;
-
-  let Inst{7-0} = offset;
-  let Inst{8} = imm;
-  let Inst{14-9} = sbase{6-1};
-  let Inst{21-15} = sdst;
-  let Inst{26-22} = op;
-  let Inst{31-27} = 0x18; //encoding
-}
-
-let SchedRW = [WriteSALU] in {
-class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let isCodeGenOnly = 0;
-  let SALU = 1;
-  let SOP1 = 1;
-}
-
-class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let isCodeGenOnly = 0;
-  let SALU = 1;
-  let SOP2 = 1;
-
-  let UseNamedOperandTable = 1;
-}
-
-class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, SOPCe <op> {
-
-  let DisableEncoding = "$dst";
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
-  let SOPC = 1;
-  let isCodeGenOnly = 0;
-
-  let UseNamedOperandTable = 1;
-}
-
-class SOPK <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
-  let SOPK = 1;
-
-  let UseNamedOperandTable = 1;
-}
-
-class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
-    InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
-  let SOPP = 1;
-
-  let UseNamedOperandTable = 1;
-}
-
-} // let SchedRW = [WriteSALU]
-
-class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let LGKM_CNT = 1;
-  let SMRD = 1;
-  let mayStore = 0;
-  let mayLoad = 1;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let SchedRW = [WriteSMEM];
-}
-
-//===----------------------------------------------------------------------===//
-// Vector ALU operations
-//===----------------------------------------------------------------------===//
-
-class VOP1e <bits<8> op> : Enc32 {
-  bits<8> vdst;
-  bits<9> src0;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = op;
-  let Inst{24-17} = vdst;
-  let Inst{31-25} = 0x3f; //encoding
-}
-
-class VOP2e <bits<6> op> : Enc32 {
-  bits<8> vdst;
-  bits<9> src0;
-  bits<8> src1;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = src1;
-  let Inst{24-17} = vdst;
-  let Inst{30-25} = op;
-  let Inst{31} = 0x0; //encoding
-}
-
-class VOP2_MADKe <bits<6> op> : Enc64 {
-
-  bits<8> vdst;
-  bits<9> src0;
-  bits<8> vsrc1;
-  bits<32> src2;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = vsrc1;
-  let Inst{24-17} = vdst;
-  let Inst{30-25} = op;
-  let Inst{31} = 0x0; // encoding
-  let Inst{63-32} = src2;
-}
-
-class VOP3e <bits<9> op> : Enc64 {
-  bits<8> vdst;
-  bits<2> src0_modifiers;
-  bits<9> src0;
-  bits<2> src1_modifiers;
-  bits<9> src1;
-  bits<2> src2_modifiers;
-  bits<9> src2;
-  bits<1> clamp;
-  bits<2> omod;
-
-  let Inst{7-0} = vdst;
-  let Inst{8} = src0_modifiers{1};
-  let Inst{9} = src1_modifiers{1};
-  let Inst{10} = src2_modifiers{1};
-  let Inst{11} = clamp;
-  let Inst{25-17} = op;
-  let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = src0;
-  let Inst{49-41} = src1;
-  let Inst{58-50} = src2;
-  let Inst{60-59} = omod;
-  let Inst{61} = src0_modifiers{0};
-  let Inst{62} = src1_modifiers{0};
-  let Inst{63} = src2_modifiers{0};
-}
-
-class VOP3be <bits<9> op> : Enc64 {
-  bits<8> vdst;
-  bits<2> src0_modifiers;
-  bits<9> src0;
-  bits<2> src1_modifiers;
-  bits<9> src1;
-  bits<2> src2_modifiers;
-  bits<9> src2;
-  bits<7> sdst;
-  bits<2> omod;
-
-  let Inst{7-0} = vdst;
-  let Inst{14-8} = sdst;
-  let Inst{25-17} = op;
-  let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = src0;
-  let Inst{49-41} = src1;
-  let Inst{58-50} = src2;
-  let Inst{60-59} = omod;
-  let Inst{61} = src0_modifiers{0};
-  let Inst{62} = src1_modifiers{0};
-  let Inst{63} = src2_modifiers{0};
-}
-
-class VOPCe <bits<8> op> : Enc32 {
-  bits<9> src0;
-  bits<8> vsrc1;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = vsrc1;
-  let Inst{24-17} = op;
-  let Inst{31-25} = 0x3e;
-}
-
-class VINTRPe <bits<2> op> : Enc32 {
-  bits<8> vdst;
-  bits<8> vsrc;
-  bits<2> attrchan;
-  bits<6> attr;
-
-  let Inst{7-0} = vsrc;
-  let Inst{9-8} = attrchan;
-  let Inst{15-10} = attr;
-  let Inst{17-16} = op;
-  let Inst{25-18} = vdst;
-  let Inst{31-26} = 0x32; // encoding
-}
-
-class DSe <bits<8> op> : Enc64 {
-  bits<8> vdst;
-  bits<1> gds;
-  bits<8> addr;
-  bits<8> data0;
-  bits<8> data1;
-  bits<8> offset0;
-  bits<8> offset1;
-
-  let Inst{7-0} = offset0;
-  let Inst{15-8} = offset1;
-  let Inst{17} = gds;
-  let Inst{25-18} = op;
-  let Inst{31-26} = 0x36; //encoding
-  let Inst{39-32} = addr;
-  let Inst{47-40} = data0;
-  let Inst{55-48} = data1;
-  let Inst{63-56} = vdst;
-}
-
-class MUBUFe <bits<7> op> : Enc64 {
-  bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<1> addr64;
-  bits<1> lds;
-  bits<8> vaddr;
-  bits<8> vdata;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0} = offset;
-  let Inst{12} = offen;
-  let Inst{13} = idxen;
-  let Inst{14} = glc;
-  let Inst{15} = addr64;
-  let Inst{16} = lds;
-  let Inst{24-18} = op;
-  let Inst{31-26} = 0x38; //encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{54} = slc;
-  let Inst{55} = tfe;
-  let Inst{63-56} = soffset;
-}
-
-class MTBUFe <bits<3> op> : Enc64 {
-  bits<8> vdata;
-  bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<1> addr64;
-  bits<4> dfmt;
-  bits<3> nfmt;
-  bits<8> vaddr;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0} = offset;
-  let Inst{12} = offen;
-  let Inst{13} = idxen;
-  let Inst{14} = glc;
-  let Inst{15} = addr64;
-  let Inst{18-16} = op;
-  let Inst{22-19} = dfmt;
-  let Inst{25-23} = nfmt;
-  let Inst{31-26} = 0x3a; //encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{54} = slc;
-  let Inst{55} = tfe;
-  let Inst{63-56} = soffset;
-}
-
-class MIMGe <bits<7> op> : Enc64 {
-  bits<8> vdata;
-  bits<4> dmask;
-  bits<1> unorm;
-  bits<1> glc;
-  bits<1> da;
-  bits<1> r128;
-  bits<1> tfe;
-  bits<1> lwe;
-  bits<1> slc;
-  bits<8> vaddr;
-  bits<7> srsrc;
-  bits<7> ssamp;
-
-  let Inst{11-8} = dmask;
-  let Inst{12} = unorm;
-  let Inst{13} = glc;
-  let Inst{14} = da;
-  let Inst{15} = r128;
-  let Inst{16} = tfe;
-  let Inst{17} = lwe;
-  let Inst{24-18} = op;
-  let Inst{25} = slc;
-  let Inst{31-26} = 0x3c;
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{57-53} = ssamp{6-2};
-}
-
-class FLATe <bits<7> op> : Enc64 {
-  bits<8> addr;
-  bits<8> data;
-  bits<8> vdst;
-  bits<1> slc;
-  bits<1> glc;
-  bits<1> tfe;
-
-  // 15-0 is reserved.
-  let Inst{16} = glc;
-  let Inst{17} = slc;
-  let Inst{24-18} = op;
-  let Inst{31-26} = 0x37; // Encoding.
-  let Inst{39-32} = addr;
-  let Inst{47-40} = data;
-  // 54-48 is reserved.
-  let Inst{55} = tfe;
-  let Inst{63-56} = vdst;
-}
-
-class EXPe : Enc64 {
-  bits<4> en;
-  bits<6> tgt;
-  bits<1> compr;
-  bits<1> done;
-  bits<1> vm;
-  bits<8> vsrc0;
-  bits<8> vsrc1;
-  bits<8> vsrc2;
-  bits<8> vsrc3;
-
-  let Inst{3-0} = en;
-  let Inst{9-4} = tgt;
-  let Inst{10} = compr;
-  let Inst{11} = done;
-  let Inst{12} = vm;
-  let Inst{31-26} = 0x3e;
-  let Inst{39-32} = vsrc0;
-  let Inst{47-40} = vsrc1;
-  let Inst{55-48} = vsrc2;
-  let Inst{63-56} = vsrc3;
-}
-
-let Uses = [EXEC] in {
-
-class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP1Common <outs, ins, asm, pattern>,
-    VOP1e <op> {
-  let isCodeGenOnly = 0;
-}
-
-class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP2Common <outs, ins, asm, pattern>, VOP2e <op> {
-  let isCodeGenOnly = 0;
-}
-
-class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
-    VOPCCommon <ins, asm, pattern>, VOPCe <op>;
-
-class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-  let mayLoad = 1;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-}
-
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Vector I/O operations
-//===----------------------------------------------------------------------===//
-
-let Uses = [EXEC] in {
-
-class DS <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let LGKM_CNT = 1;
-  let DS = 1;
-  let UseNamedOperandTable = 1;
-  let Uses = [M0];
-
-  // Most instructions load and store data, so set this as the default.
-  let mayLoad = 1;
-  let mayStore = 1;
-
-  let hasSideEffects = 0;
-  let AsmMatchConverter = "cvtDS";
-  let SchedRW = [WriteLDS];
-}
-
-class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MUBUF = 1;
-
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let AsmMatchConverter = "cvtMubuf";
-  let SchedRW = [WriteVMEM];
-}
-
-class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MTBUF = 1;
-
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let SchedRW = [WriteVMEM];
-}
-
-class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, FLATe <op> {
-  let FLAT = 1;
-  // Internally, FLAT instructions are executed as both an LDS and a
-  // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
-  // and are not considered done until both have been decremented.
-  let VM_CNT = 1;
-  let LGKM_CNT = 1;
-
-  let Uses = [EXEC, FLAT_SCR]; // M0
-
-  let UseNamedOperandTable = 1;
-  let hasSideEffects = 0;
-  let AsmMatchConverter = "cvtFlat";
-  let SchedRW = [WriteVMEM];
-}
-
-class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, MIMGe <op> {
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MIMG = 1;
-
-  let hasSideEffects = 0; // XXX ????
-}
-
-
-} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
deleted file mode 100644
index d647c25286f..00000000000
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ /dev/null
@@ -1,2723 +0,0 @@
-//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief SI Implementation of TargetInstrInfo.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "SIInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "SIDefines.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
-    : AMDGPUInstrInfo(st), RI() {}
-
-//===----------------------------------------------------------------------===//
-// TargetInstrInfo callbacks
-//===----------------------------------------------------------------------===//
-
-static unsigned getNumOperandsNoGlue(SDNode *Node) {
-  unsigned N = Node->getNumOperands();
-  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
-    --N;
-  return N;
-}
-
-static SDValue findChainOperand(SDNode *Load) {
-  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
-  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
-  return LastOp;
-}
-
-/// \brief Returns true if both nodes have the same value for the given
-/// operand \p Op, or if both nodes do not have this operand.
-static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
-  unsigned Opc0 = N0->getMachineOpcode();
-  unsigned Opc1 = N1->getMachineOpcode();
-
-  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
-  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
-
-  if (Op0Idx == -1 && Op1Idx == -1)
-    return true;
-
-
-  if ((Op0Idx == -1 && Op1Idx != -1) ||
-      (Op1Idx == -1 && Op0Idx != -1))
-    return false;
-
-  // getNamedOperandIdx returns the index for the MachineInstr's operands,
-  // which includes the result as the first operand. We are indexing into the
-  // MachineSDNode's operands, so we need to skip the result operand to get
-  // the real index.
-  --Op0Idx;
-  --Op1Idx;
-
-  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
-}
-
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
-                                                    AliasAnalysis *AA) const {
-  // TODO: The generic check fails for VALU instructions that should be
-  // rematerializable due to implicit reads of exec. We really want all of the
-  // generic logic for this except for this.
- switch (MI->getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - return true; - default: - return false; - } -} - -bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, - int64_t &Offset0, - int64_t &Offset1) const { - if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) - return false; - - unsigned Opc0 = Load0->getMachineOpcode(); - unsigned Opc1 = Load1->getMachineOpcode(); - - // Make sure both are actually loads. - if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) - return false; - - if (isDS(Opc0) && isDS(Opc1)) { - - // FIXME: Handle this case: - if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) - return false; - - // Check base reg. - if (Load0->getOperand(1) != Load1->getOperand(1)) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - - // Skip read2 / write2 variants for simplicity. - // TODO: We should report true if the used offsets are adjacent (excluded - // st64 versions). - if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || - AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) - return false; - - Offset0 = cast(Load0->getOperand(2))->getZExtValue(); - Offset1 = cast(Load1->getOperand(2))->getZExtValue(); - return true; - } - - if (isSMRD(Opc0) && isSMRD(Opc1)) { - assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); - - // Check base reg. - if (Load0->getOperand(0) != Load1->getOperand(0)) - return false; - - const ConstantSDNode *Load0Offset = - dyn_cast(Load0->getOperand(1)); - const ConstantSDNode *Load1Offset = - dyn_cast(Load1->getOperand(1)); - - if (!Load0Offset || !Load1Offset) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - - Offset0 = Load0Offset->getZExtValue(); - Offset1 = Load1Offset->getZExtValue(); - return true; - } - - // MUBUF and MTBUF can access the same addresses. - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { - - // MUBUF and MTBUF have vaddr at different indices. - if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || - findChainOperand(Load0) != findChainOperand(Load1) || - !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || - !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) - return false; - - int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); - int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); - - if (OffIdx0 == -1 || OffIdx1 == -1) - return false; - - // getNamedOperandIdx returns the index for MachineInstrs. Since they - // inlcude the output in the operand list, but SDNodes don't, we need to - // subtract the index by one. - --OffIdx0; - --OffIdx1; - - SDValue Off0 = Load0->getOperand(OffIdx0); - SDValue Off1 = Load1->getOperand(OffIdx1); - - // The offset might be a FrameIndexSDNode. 
- if (!isa(Off0) || !isa(Off1)) - return false; - - Offset0 = cast(Off0)->getZExtValue(); - Offset1 = cast(Off1)->getZExtValue(); - return true; - } - - return false; -} - -static bool isStride64(unsigned Opc) { - switch (Opc) { - case AMDGPU::DS_READ2ST64_B32: - case AMDGPU::DS_READ2ST64_B64: - case AMDGPU::DS_WRITE2ST64_B32: - case AMDGPU::DS_WRITE2ST64_B64: - return true; - default: - return false; - } -} - -bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, - unsigned &BaseReg, unsigned &Offset, - const TargetRegisterInfo *TRI) const { - unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - if (OffsetImm) { - // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); - - BaseReg = AddrReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - // The 2 offset instructions use offset0 and offset1 instead. We can treat - // these as a load with a single offset if the 2 offsets are consecutive. We - // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset1); - - uint8_t Offset0 = Offset0Imm->getImm(); - uint8_t Offset1 = Offset1Imm->getImm(); - assert(Offset1 > Offset0); - - if (Offset1 - Offset0 == 1) { - // Each of these offsets is in element sized units, so we need to convert - // to bytes of the individual reads. - - unsigned EltSize; - if (LdSt->mayLoad()) - EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; - else { - assert(LdSt->mayStore()); - int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); - } - - if (isStride64(Opc)) - EltSize *= 64; - - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); - BaseReg = AddrReg->getReg(); - Offset = EltSize * Offset0; - return true; - } - - return false; - } - - if (isMUBUF(Opc) || isMTBUF(Opc)) { - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) - return false; - - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::vaddr); - if (!AddrReg) - return false; - - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - BaseReg = AddrReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - if (isSMRD(Opc)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - if (!OffsetImm) - return false; - - const MachineOperand *SBaseReg = getNamedOperand(*LdSt, - AMDGPU::OpName::sbase); - BaseReg = SBaseReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - return false; -} - -bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - - // TODO: This needs finer tuning - if (NumLoads > 4) - return false; - - if (isDS(Opc0) && isDS(Opc1)) - return true; - - if (isSMRD(Opc0) && isSMRD(Opc1)) - return true; - - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) - return true; - - return false; -} - -void -SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - - // If we are trying to copy to or from SCC, there is a bug somewhere else 
in - // the backend. While it may be theoretically possible to do this, it should - // never be necessary. - assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - - static const int16_t Sub0_15[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 - }; - - static const int16_t Sub0_7[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 - }; - - static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 - }; - - static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 - }; - - static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 - }; - - unsigned Opcode; - const int16_t *SubIndices; - - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { - if (DestReg == AMDGPU::VCC) { - if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) - .addReg(SrcReg, getKillRegState(KillSrc)); - } else { - // FIXME: Hack until VReg_1 removed. - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) - .addImm(0) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - - return; - } - - assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; - - } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; - - } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; - - } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || - AMDGPU::SReg_64RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_1; - - } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_2; - - } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || - AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_3; - - } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || - AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_7; - - } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || - 
AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_15; - - } else { - llvm_unreachable("Can't copy register!"); - } - - while (unsigned SubIdx = *SubIndices++) { - MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, - get(Opcode), RI.getSubReg(DestReg, SubIdx)); - - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); - - if (*SubIndices) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - } -} - -unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { - const unsigned Opcode = MI.getOpcode(); - - int NewOpc; - - // Try to map original to commuted opcode - NewOpc = AMDGPU::getCommuteRev(Opcode); - // Check if the commuted (REV) opcode exists on the target. - if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) - return NewOpc; - - // Try to map commuted to original opcode - NewOpc = AMDGPU::getCommuteOrig(Opcode); - // Check if the original (non-REV) opcode exists on the target. - if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) - return NewOpc; - - return Opcode; -} - -unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - - if (DstRC->getSize() == 4) { - return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { - return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { - return AMDGPU::V_MOV_B64_PSEUDO; - } - return AMDGPU::COPY; -} - -void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)) { - // We are only allowed to create one new instruction when spilling - // registers, so we need to use pseudo instruction for spilling - // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } - } - - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. 
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" - " spill register"); - BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); - } -} - -void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - MachineFunction *MF = MBB.getParent(); - const SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } - - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - - } else { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" - " restore register"); - BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); - } -} - -/// \param @Offset Offset in bytes of the FrameIndex being spilled -unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, unsigned TmpReg, - unsigned FrameOffset, - unsigned Size) const { - MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - const AMDGPUSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = - static_cast(ST.getRegisterInfo()); - DebugLoc DL = MBB.findDebugLoc(MI); - unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); - unsigned WavefrontSize = ST.getWavefrontSize(); - - unsigned TIDReg = MFI->getTIDReg(); - if (!MFI->hasCalculatedTID()) { - MachineBasicBlock &Entry = MBB.getParent()->front(); - MachineBasicBlock::iterator Insert = Entry.front(); - DebugLoc DL = Insert->getDebugLoc(); - - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); - if (TIDReg == AMDGPU::NoRegister) - return TIDReg; - - - if (MFI->getShaderType() == ShaderType::COMPUTE && - WorkGroupSize > WavefrontSize) { - - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); - unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); - for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { - if 
(!Entry.isLiveIn(Reg)) - Entry.addLiveIn(Reg); - } - - RS->enterBasicBlock(&Entry); - unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) - .addReg(InputPtrReg) - .addImm(SI::KernelInputOffsets::NGROUPS_Z); - BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) - .addReg(InputPtrReg) - .addImm(SI::KernelInputOffsets::NGROUPS_Y); - - // NGROUPS.X * NGROUPS.Y - BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) - .addReg(STmp1) - .addReg(STmp0); - // (NGROUPS.X * NGROUPS.Y) * TIDIG.X - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) - .addReg(STmp1) - .addReg(TIDIGXReg); - // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) - .addReg(STmp0) - .addReg(TIDIGYReg) - .addReg(TIDReg); - // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z - BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) - .addReg(TIDReg) - .addReg(TIDIGZReg); - } else { - // Get the wave id - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), - TIDReg) - .addImm(-1) - .addImm(0); - - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), - TIDReg) - .addImm(-1) - .addReg(TIDReg); - } - - BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), - TIDReg) - .addImm(2) - .addReg(TIDReg); - MFI->setTIDReg(TIDReg); - } - - // Add FrameIndex to LDS offset - unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) - .addImm(LDSOffset) - .addReg(TIDReg); - - return TmpReg; -} - -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { - while (Count > 0) { - int Arg; - if (Count >= 8) - Arg = 7; - else - Arg = Count - 1; - Count -= 8; - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) - .addImm(Arg); - } -} - -bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MBB.findDebugLoc(MI); - switch (MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } - case AMDGPU::SGPR_USE: - // This is just a placeholder for register allocation. - MI->eraseFromParent(); - break; - - case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - - const MachineOperand &SrcOp = MI->getOperand(1); - // FIXME: Will this work for 64-bit floating point immediates? 
- assert(!SrcOp.isFPImm()); - if (SrcOp.isImm()) { - APInt Imm(64, SrcOp.getImm()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); - } else { - assert(SrcOp.isReg()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit); - } - MI->eraseFromParent(); - break; - } - - case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI->getOperand(1).getReg(); - unsigned Src1 = MI->getOperand(2).getReg(); - const MachineOperand &SrcCond = MI->getOperand(3); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addOperand(SrcCond); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addOperand(SrcCond); - MI->eraseFromParent(); - break; - } - } - return true; -} - -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - - MachineOperand &Src0 = MI->getOperand(Src0Idx); - if (!Src0.isReg()) - return nullptr; - - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - if (Src1Idx == -1) - return nullptr; - - MachineOperand &Src1 = MI->getOperand(Src1Idx); - - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; - } - - if (!Src1.isReg()) { - // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return nullptr; - } - - // Be sure to copy the source modifiers to the right place. - if (MachineOperand *Src0Mods - = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods - = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); - - int Src0ModsVal = Src0Mods->getImm(); - if (!Src1Mods && Src0ModsVal != 0) - return nullptr; - - // XXX - This assert might be a lie. It might be useful to have a neg - // modifier with 0.0. 
- int Src1ModsVal = Src1Mods->getImm(); - assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); - - Src1Mods->setImm(Src0ModsVal); - Src0Mods->setImm(Src1ModsVal); - } - - unsigned Reg = Src0.getReg(); - unsigned SubReg = Src0.getSubReg(); - if (Src1.isImm()) - Src0.ChangeToImmediate(Src1.getImm()); - else - llvm_unreachable("Should only have immediates"); - - Src1.ChangeToRegister(Reg, false); - Src1.setSubReg(SubReg); - } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); - } - - if (MI) - MI->setDesc(get(commuteOpcode(*MI))); - - return MI; -} - -// This needs to be implemented because the source modifiers may be inserted -// between the true commutable operands, and the base -// TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isCommutable()) - return false; - - unsigned Opc = MI->getOpcode(); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - if (Src0Idx == -1) - return false; - - // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. - if (!MI->getOperand(Src0Idx).isReg()) - return false; - - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - if (Src1Idx == -1) - return false; - - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. - if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) - return false; - - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; -} - -MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, - unsigned SrcReg) const { - return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), - DstReg) .addReg(SrcReg); -} - -bool SIInstrInfo::isMov(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - return true; - } -} - -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - -static void removeModOperands(MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src0_modifiers); - int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src1_modifiers); - int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2_modifiers); - - MI.RemoveOperand(Src2ModIdx); - MI.RemoveOperand(Src1ModIdx); - MI.RemoveOperand(Src0ModIdx); -} - -bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const { - if (!MRI->hasOneNonDBGUse(Reg)) - return false; - - unsigned Opc = UseMI->getOpcode(); - if (Opc == AMDGPU::V_MAD_F32) { - // Don't fold if we are using source modifiers. The new VOP2 instructions - // don't have them. 
- if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { - return false; - } - - MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); - MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); - MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); - - // Multiplied part is the constant: Use v_madmk_f32 - // We should only expect these to be on src0 due to canonicalizations. - if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; - - if (!Src2->isReg() || - (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) - return false; - - // We need to do some weird looking operand shuffling since the madmk - // operands are out of the normal expected order with the multiplied - // constant as the last operand. - // - // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 - // src0 -> src2 K - // src1 -> src0 - // src2 -> src1 - - const int64_t Imm = DefMI->getOperand(1).getImm(); - - // FIXME: This would be a lot easier if we could return a new instruction - // instead of having to modify in place. - - // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::clamp)); - - unsigned Src1Reg = Src1->getReg(); - unsigned Src1SubReg = Src1->getSubReg(); - unsigned Src2Reg = Src2->getReg(); - unsigned Src2SubReg = Src2->getSubReg(); - Src0->setReg(Src1Reg); - Src0->setSubReg(Src1SubReg); - Src0->setIsKill(Src1->isKill()); - - Src1->setReg(Src2Reg); - Src1->setSubReg(Src2SubReg); - Src1->setIsKill(Src2->isKill()); - - Src2->ChangeToImmediate(Imm); - - removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); - - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); - if (DeleteDef) - DefMI->eraseFromParent(); - - return true; - } - - // Added part is the constant: Use v_madak_f32 - if (Src2->isReg() && Src2->getReg() == Reg) { - // Not allowed to use constant bus for another operand. - // We can however allow an inline immediate as src0. - if (!Src0->isImm() && - (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) - return false; - - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; - - const int64_t Imm = DefMI->getOperand(1).getImm(); - - // FIXME: This would be a lot easier if we could return a new instruction - // instead of having to modify in place. - - // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::clamp)); - - Src2->ChangeToImmediate(Imm); - - // These come before src2. 
- removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); - - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); - if (DeleteDef) - DefMI->eraseFromParent(); - - return true; - } - } - - return false; -} - -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - -static bool offsetsDoNotOverlap(int WidthA, int OffsetA, - int WidthB, int OffsetB) { - int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; - int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; - int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; - return LowOffset + LowWidth <= HighOffset; -} - -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const { - unsigned BaseReg0, Offset0; - unsigned BaseReg1, Offset1; - - if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && - getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { - assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && - "read2 / write2 not expected here yet"); - unsigned Width0 = (*MIa->memoperands_begin())->getSize(); - unsigned Width1 = (*MIb->memoperands_begin())->getSize(); - if (BaseReg0 == BaseReg1 && - offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { - return true; - } - } - - return false; -} - -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && - "MIa must load from or modify a memory location"); - assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && - "MIb must load from or modify a memory location"); - - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) - return false; - - // XXX - Can we relax this between address spaces? - if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) - return false; - - // TODO: Should we check the address space from the MachineMemOperand? That - // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, - // e.g. private accesses lowered to use MUBUF instructions on a scratch - // buffer. 
- if (isDS(Opc0)) { - if (isDS(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1); - } - - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1) && !isSMRD(Opc1); - } - - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); - } - - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return false; - } - - return false; -} - -bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int64_t SVal = Imm.getSExtValue(); - if (SVal >= -16 && SVal <= 64) - return true; - - if (Imm.getBitWidth() == 64) { - uint64_t Val = Imm.getZExtValue(); - return (DoubleToBits(0.0) == Val) || - (DoubleToBits(1.0) == Val) || - (DoubleToBits(-1.0) == Val) || - (DoubleToBits(0.5) == Val) || - (DoubleToBits(-0.5) == Val) || - (DoubleToBits(2.0) == Val) || - (DoubleToBits(-2.0) == Val) || - (DoubleToBits(4.0) == Val) || - (DoubleToBits(-4.0) == Val); - } - - // The actual type of the operand does not seem to matter as long - // as the bits match one of the inline immediate values. For example: - // - // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, - // so it is a legal inline immediate. - // - // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in - // floating-point, so it is a legal inline immediate. - uint32_t Val = Imm.getZExtValue(); - - return (FloatToBits(0.0f) == Val) || - (FloatToBits(1.0f) == Val) || - (FloatToBits(-1.0f) == Val) || - (FloatToBits(0.5f) == Val) || - (FloatToBits(-0.5f) == Val) || - (FloatToBits(2.0f) == Val) || - (FloatToBits(-2.0f) == Val) || - (FloatToBits(4.0f) == Val) || - (FloatToBits(-4.0f) == Val); -} - -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - unsigned OpSize) const { - if (MO.isImm()) { - // MachineOperand provides no way to tell the true operand size, since it - // only records a 64-bit value. We need to know the size to determine if a - // 32-bit floating point immediate bit pattern is legal for an integer - // immediate. It would be for any 32-bit integer operand, but would not be - // for a 64-bit one. 
- - unsigned BitSize = 8 * OpSize; - return isInlineConstant(APInt(BitSize, MO.getImm(), true)); - } - - return false; -} - -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, - unsigned OpSize) const { - return MO.isImm() && !isInlineConstant(MO, OpSize); -} - -static bool compareMachineOp(const MachineOperand &Op0, - const MachineOperand &Op1) { - if (Op0.getType() != Op1.getType()) - return false; - - switch (Op0.getType()) { - case MachineOperand::MO_Register: - return Op0.getReg() == Op1.getReg(); - case MachineOperand::MO_Immediate: - return Op0.getImm() == Op1.getImm(); - default: - llvm_unreachable("Didn't expect to be comparing these operand types"); - } -} - -bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; - - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); - - if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) - return true; - - if (OpInfo.RegClass < 0) - return false; - - unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); - if (isLiteralConstant(MO, OpSize)) - return RI.opCanUseLiteralConstant(OpInfo.OperandType); - - return RI.opCanUseInlineConstant(OpInfo.OperandType); -} - -bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { - int Op32 = AMDGPU::getVOPe32(Opcode); - if (Op32 == -1) - return false; - - return pseudoToMCOpcode(Op32) != -1; -} - -bool SIInstrInfo::hasModifiers(unsigned Opcode) const { - // The src0_modifier operand is present on all instructions - // that have modifiers. - - return AMDGPU::getNamedOperandIdx(Opcode, - AMDGPU::OpName::src0_modifiers) != -1; -} - -bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const { - const MachineOperand *Mods = getNamedOperand(MI, OpName); - return Mods && Mods->getImm(); -} - -bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO, - unsigned OpSize) const { - // Literal constants use the constant bus. - if (isLiteralConstant(MO, OpSize)) - return true; - - if (!MO.isReg() || !MO.isUse()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) - return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); - - // FLAT_SCR is just an SGPR pair. - if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) - return true; - - // EXEC register uses the constant bus. - if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - return true; - - // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - return true; - } - - return false; -} - -bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const { - uint16_t Opcode = MI->getOpcode(); - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); - - // Make sure the number of operands is correct. 
- const MCInstrDesc &Desc = get(Opcode); - if (!Desc.isVariadic() && - Desc.getNumOperands() != MI->getNumExplicitOperands()) { - ErrInfo = "Instruction has wrong number of operands."; - return false; - } - - // Make sure the register classes are correct - for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isFPImm()) { - ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " - "all fp values to integers."; - return false; - } - - int RegClass = Desc.OpInfo[i].RegClass; - - switch (Desc.OpInfo[i].OperandType) { - case MCOI::OPERAND_REGISTER: - if (MI->getOperand(i).isImm()) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } - break; - case AMDGPU::OPERAND_REG_IMM32: - break; - case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI->getOperand(i), - RI.getRegClass(RegClass)->getSize())) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } - break; - case MCOI::OPERAND_IMMEDIATE: - // Check if this operand is an immediate. - // FrameIndex operands will be replaced by immediates, so they are - // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { - ErrInfo = "Expected immediate, but got non-immediate"; - return false; - } - // Fall-through - default: - continue; - } - - if (!MI->getOperand(i).isReg()) - continue; - - if (RegClass != -1) { - unsigned Reg = MI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - - const TargetRegisterClass *RC = RI.getRegClass(RegClass); - if (!RC->contains(Reg)) { - ErrInfo = "Operand has incorrect register class."; - return false; - } - } - } - - - // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { - // Only look at the true operands. Only a real operand can use the constant - // bus, and we don't want to check pseudo-operands like the source modifier - // flags. - const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; - - unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; - for (int OpIdx : OpIndices) { - if (OpIdx == -1) - break; - const MachineOperand &MO = MI->getOperand(OpIdx); - if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { - if (MO.isReg()) { - if (MO.getReg() != SGPRUsed) - ++ConstantBusCount; - SGPRUsed = MO.getReg(); - } else { - ++ConstantBusCount; - } - } - } - if (ConstantBusCount > 1) { - ErrInfo = "VOP* instruction uses the constant bus more than once"; - return false; - } - } - - // Verify misc. restrictions on specific instructions. - if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || - Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - const MachineOperand &Src0 = MI->getOperand(Src0Idx); - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - const MachineOperand &Src2 = MI->getOperand(Src2Idx); - if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { - if (!compareMachineOp(Src0, Src1) && - !compareMachineOp(Src0, Src2)) { - ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; - return false; - } - } - } - - return true; -} - -unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return AMDGPU::INSTRUCTION_LIST_END; - case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; - case AMDGPU::COPY: return AMDGPU::COPY; - case AMDGPU::PHI: return AMDGPU::PHI; - case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; - case AMDGPU::S_MOV_B32: - return MI.getOperand(1).isReg() ? 
- AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; - case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; - case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; - case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; - case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; - case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; - case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; - case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; - case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; - case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; - case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; - case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; - case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; - case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; - case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; - case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; - case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; - case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; - case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; - case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; - case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; - case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; - case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; - case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; - case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; - case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; - case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; - case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; - case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; - case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; - case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; - case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; - case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; - case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; - } -} - -bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { - return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; -} - -const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - const MCInstrDesc &Desc = get(MI.getOpcode()); - if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || - Desc.OpInfo[OpNo].RegClass == -1) { - unsigned Reg = MI.getOperand(OpNo).getReg(); - - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return MRI.getRegClass(Reg); - return RI.getPhysRegClass(Reg); - } - - unsigned RCID = Desc.OpInfo[OpNo].RegClass; - return RI.getRegClass(RCID); -} - -bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { - switch (MI.getOpcode()) { - case AMDGPU::COPY: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::PHI: - case AMDGPU::INSERT_SUBREG: - return RI.hasVGPRs(getOpRegClass(MI, 0)); - default: - return RI.hasVGPRs(getOpRegClass(MI, OpNo)); - } -} - -void 
SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { - MachineBasicBlock::iterator I = MI; - MachineBasicBlock *MBB = MI->getParent(); - MachineOperand &MO = MI->getOperand(OpIdx); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; - const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (MO.isReg()) - Opcode = AMDGPU::COPY; - else if (RI.isSGPRClass(RC)) - Opcode = AMDGPU::S_MOV_B32; - - - const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); - if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) - VRC = &AMDGPU::VReg_64RegClass; - else - VRC = &AMDGPU::VGPR_32RegClass; - - unsigned Reg = MRI.createVirtualRegister(VRC); - DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) - .addOperand(MO); - MO.ChangeToRegister(Reg, false); -} - -unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) - const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); - unsigned SubReg = MRI.createVirtualRegister(SubRC); - - // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to worry about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to - // eliminate this extra copy. - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) - .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(NewSuperReg, 0, SubIdx); - - return SubReg; -} - -MachineOperand SIInstrInfo::buildExtractSubRegOrImm( - MachineBasicBlock::iterator MII, - MachineRegisterInfo &MRI, - MachineOperand &Op, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const { - if (Op.isImm()) { - // XXX - Is there a better way to do this? 
- if (SubIdx == AMDGPU::sub0) - return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); - if (SubIdx == AMDGPU::sub1) - return MachineOperand::CreateImm(Op.getImm() >> 32); - - llvm_unreachable("Unhandled register index for immediate"); - } - - unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, - SubIdx, SubRC); - return MachineOperand::CreateReg(SubReg, false); -} - -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - -// Change the order of operands from (0, 1, 2) to (0, 2, 1) -void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { - assert(Inst->getNumExplicitOperands() == 3); - MachineOperand Op1 = Inst->getOperand(1); - Inst->RemoveOperand(1); - Inst->addOperand(Op1); -} - -bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, - const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - const MCInstrDesc &InstDesc = get(MI->getOpcode()); - const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; - const TargetRegisterClass *DefinedRC = - OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; - if (!MO) - MO = &MI->getOperand(OpIdx); - - if (isVALU(InstDesc.Opcode) && - usesConstantBus(MRI, *MO, DefinedRC->getSize())) { - unsigned SGPRUsed = - MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (i == OpIdx) - continue; - const MachineOperand &Op = MI->getOperand(i); - if (Op.isReg() && Op.getReg() != SGPRUsed && - usesConstantBus(MRI, Op, getOpSize(*MI, i))) { - return false; - } - } - } - - if (MO->isReg()) { - assert(DefinedRC); - const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; - } - - - // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); - - if (!DefinedRC) { - // This operand expects an immediate. 
- return true; - } - - return isImmOperandLegal(MI, OpIdx, *MO); -} - -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); - - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) - legalizeOpWithMove(MI, Src0Idx); - - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; - - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } - - legalizeOpWithMove(MI, Src1Idx); - return; - } - - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; - - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); - - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); - - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal - - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); - - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); - } - } - - // Legalize REG_SEQUENCE and PHI - // The register class of the operands much be the same type as the register - // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || - MI->getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) - continue; - const TargetRegisterClass *OpRC = - MRI.getRegClass(MI->getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { - VRC = OpRC; - } else { - SRC = OpRC; - } - } - - // If any of the operands are VGPR registers, then they all most be - // otherwise we will create illegal VGPR->SGPR copies when legalizing - // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { - if (!VRC) { - assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); - } - RC = VRC; - } else { - RC = SRC; - } - - // Update all the operands so they have the same type. 
-    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
-      if (!MI->getOperand(i).isReg() ||
-          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
-        continue;
-      unsigned DstReg = MRI.createVirtualRegister(RC);
-      MachineBasicBlock *InsertBB;
-      MachineBasicBlock::iterator Insert;
-      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
-        InsertBB = MI->getParent();
-        Insert = MI;
-      } else {
-        // MI is a PHI instruction.
-        InsertBB = MI->getOperand(i + 1).getMBB();
-        Insert = InsertBB->getFirstTerminator();
-      }
-      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
-              get(AMDGPU::COPY), DstReg)
-              .addOperand(MI->getOperand(i));
-      MI->getOperand(i).setReg(DstReg);
-    }
-  }
-
-  // Legalize INSERT_SUBREG
-  // src0 must have the same register class as dst
-  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
-    unsigned Dst = MI->getOperand(0).getReg();
-    unsigned Src0 = MI->getOperand(1).getReg();
-    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
-    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
-    if (DstRC != Src0RC) {
-      MachineBasicBlock &MBB = *MI->getParent();
-      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
-              .addReg(Src0);
-      MI->getOperand(1).setReg(NewSrc0);
-    }
-    return;
-  }
-
-  // Legalize MUBUF* instructions
-  // FIXME: If we start using the non-addr64 instructions for compute, we
-  // may need to legalize them here.
-  int SRsrcIdx =
-      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
-  if (SRsrcIdx != -1) {
-    // We have an MUBUF instruction
-    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
-    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
-    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
-                             RI.getRegClass(SRsrcRC))) {
-      // The operands are legal.
-      // FIXME: We may need to legalize operands besides srsrc.
-      return;
-    }
-
-    MachineBasicBlock &MBB = *MI->getParent();
-    // Extract the ptr from the resource descriptor.
- - // SRsrcPtrLo = srsrc:sub0 - unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); - - // SRsrcPtrHi = srsrc:sub1 - unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); - - // Create an empty resource descriptor - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); - - // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); - - // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - - MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned NewVAddrLo; - unsigned NewVAddrHi; - if (VAddr) { - // This is already an ADDR64 instruction so we need to add the pointer - // extracted from the resource descriptor to the current value of VAddr. - NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) - .addReg(SRsrcPtrLo) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - - // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) - .addReg(SRsrcPtrHi) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine) - .addReg(AMDGPU::VCC, RegState::Implicit); - - } else { - // This instructions is the _OFFSET variant, so we need to convert it to - // ADDR64. - MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); - MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); - MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - - // Create the new instruction. - unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); - MachineInstr *Addr64 = - BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. 
- .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe - - MI->removeFromParent(); - MI = Addr64; - - NewVAddrLo = SRsrcPtrLo; - NewVAddrHi = SRsrcPtrHi; - VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); - } - - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); - - - // Update the instruction to use NewVaddr - VAddr->setReg(NewVAddr); - // Update the instruction to use NewSRsrc - SRsrc->setReg(NewSRsrc); - } -} - -void SIInstrInfo::splitSMRD(MachineInstr *MI, - const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const { - - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RegLo = MRI.createVirtualRegister(HalfRC); - unsigned RegHi = MRI.createVirtualRegister(HalfRC); - unsigned HalfSize = HalfRC->getSize(); - const MachineOperand *OffOp = - getNamedOperand(*MI, AMDGPU::OpName::offset); - const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); - - // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes - // on VI. - - bool IsKill = SBase->isKill(); - if (OffOp) { - bool isVI = - MBB->getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS; - unsigned OffScale = isVI ? 1 : 4; - // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm() * OffScale; - unsigned HiOffset = LoOffset + HalfSize; - Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - // Use addReg instead of addOperand - // to make sure kill flag is cleared. - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addImm(LoOffset / OffScale); - - if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { - unsigned OffsetSGPR = - MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset); // The offset in register is in bytes. 
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } else { - Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addImm(HiOffset / OffScale); - } - } else { - // Handle the _SGPR variant - MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); - Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addOperand(*SOff); - unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addOperand(*SOff) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } - - unsigned SubLo, SubHi; - switch (HalfSize) { - case 4: - SubLo = AMDGPU::sub0; - SubHi = AMDGPU::sub1; - break; - case 8: - SubLo = AMDGPU::sub0_sub1; - SubHi = AMDGPU::sub2_sub3; - break; - case 16: - SubLo = AMDGPU::sub0_sub1_sub2_sub3; - SubHi = AMDGPU::sub4_sub5_sub6_sub7; - break; - case 32: - SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; - SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; - break; - default: - llvm_unreachable("Unhandled HalfSize"); - } - - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); -} - -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { - MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { - unsigned NewOpcode = getVALUOp(*MI); - unsigned RegOffset; - unsigned ImmOffset; - - if (MI->getOperand(2).isReg()) { - RegOffset = MI->getOperand(2).getReg(); - ImmOffset = 0; - } else { - assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offsets on SI and byte offset on VI - // and MUBUF instructions always take a byte offset. 
- ImmOffset = MI->getOperand(2).getImm(); - if (MBB->getParent()->getSubtarget().getGeneration() <= - AMDGPUSubtarget::SEA_ISLANDS) - ImmOffset <<= 2; - RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - if (isUInt<12>(ImmOffset)) { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(0); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(ImmOffset); - ImmOffset = 0; - } - } - - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - unsigned DWord0 = RegOffset; - unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) - .addImm(0); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(RsrcDataFormat >> 32); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - break; - } - case AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, - AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); - break; - } - - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, - AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); - break; - } - } -} - -void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector Worklist; - Worklist.push_back(&TopInst); - - while (!Worklist.empty()) { - MachineInstr *Inst = Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - unsigned Opcode = Inst->getOpcode(); - unsigned NewOpcode = getVALUOp(*Inst); - - // Handle some special cases - switch (Opcode) { - default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); - } - break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. 
- if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } - case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_BCNT1_I32_B64: - splitScalar64BitBCNT(Worklist, Inst); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_BFE_I64: { - splitScalar64BitBFE(Worklist, Inst); - Inst->eraseFromParent(); - continue; - } - - case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHLREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_ASHRREV_I32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHRREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHLREV_B64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_ASHRREV_I64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHRREV_B64; - swapOperands(Inst); - } - break; - - case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFM_B64: - llvm_unreachable("Moving this op to VALU not implemented"); - } - - if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { - // We cannot move this instruction to the VALU, so we should try to - // legalize its operands instead. - legalizeOperands(Inst); - continue; - } - - // Use the new VALU Opcode. - const MCInstrDesc &NewDesc = get(NewOpcode); - Inst->setDesc(NewDesc); - - // Remove any references to SCC. Vector instructions can't read from it, and - // We're just about to add the implicit use / defs of VCC, and we don't want - // both. - for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) - Inst->RemoveOperand(i); - } - - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(Size)); - - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. - Inst->addOperand(MachineOperand::CreateImm(0)); - } - - addDescImplicitUseDef(NewDesc, Inst); - - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); - - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(BitWidth)); - } - - // Update the destination register class. - - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } - - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - - // Legalize the operands - legalizeOperands(Inst); - - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } - } -} - -//===----------------------------------------------------------------------===// -// Indirect addressing callbacks -//===----------------------------------------------------------------------===// - -unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VGPR_32RegClass; -} - -void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - DebugLoc DL = Inst->getDebugLoc(); - - MachineBasicBlock::iterator MII = Inst; - - const MCInstrDesc &InstDesc = get(Opcode); - const TargetRegisterClass *Src0RC = Src0.isReg() ? 
- MRI.getRegClass(Src0.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); - - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0); - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1); - - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); -} - -void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - MachineOperand &Src1 = Inst->getOperand(2); - DebugLoc DL = Inst->getDebugLoc(); - - MachineBasicBlock::iterator MII = Inst; - - const MCInstrDesc &InstDesc = get(Opcode); - const TargetRegisterClass *Src0RC = Src0.isReg() ? - MRI.getRegClass(Src0.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - const TargetRegisterClass *Src1RC = Src1.isReg() ? - MRI.getRegClass(Src1.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub0, Src1SubRC); - - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); - - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub1, Src1SubRC); - - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); - - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid. 
-  Worklist.push_back(LoHalf);
-  Worklist.push_back(HiHalf);
-}
-
-void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
-                                       MachineInstr *Inst) const {
-  MachineBasicBlock &MBB = *Inst->getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst->getDebugLoc();
-
-  MachineOperand &Dest = Inst->getOperand(0);
-  MachineOperand &Src = Inst->getOperand(1);
-
-  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
-  const TargetRegisterClass *SrcRC = Src.isReg() ?
-    MRI.getRegClass(Src.getReg()) :
-    &AMDGPU::SGPR_32RegClass;
-
-  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
-
-  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
-                                                      AMDGPU::sub0, SrcSubRC);
-  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
-                                                      AMDGPU::sub1, SrcSubRC);
-
-  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
-    .addOperand(SrcRegSub0)
-    .addImm(0);
-
-  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
-    .addOperand(SrcRegSub1)
-    .addReg(MidReg);
-
-  MRI.replaceRegWith(Dest.getReg(), ResultReg);
-
-  Worklist.push_back(First);
-  Worklist.push_back(Second);
-}
-
-void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
-                                      MachineInstr *Inst) const {
-  MachineBasicBlock &MBB = *Inst->getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-  MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst->getDebugLoc();
-
-  MachineOperand &Dest = Inst->getOperand(0);
-  uint32_t Imm = Inst->getOperand(2).getImm();
-  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
-  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
-
-  (void) Offset;
-
-  // Only sext_inreg cases handled.
-  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
-         BitWidth <= 32 &&
-         Offset == 0 &&
-         "Not implemented");
-
-  if (BitWidth < 32) {
-    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
-      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
-      .addImm(0)
-      .addImm(BitWidth);
-
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
-      .addImm(31)
-      .addReg(MidRegLo);
-
-    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
-      .addReg(MidRegLo)
-      .addImm(AMDGPU::sub0)
-      .addReg(MidRegHi)
-      .addImm(AMDGPU::sub1);
-
-    MRI.replaceRegWith(Dest.getReg(), ResultReg);
-    return;
-  }
-
-  MachineOperand &Src = Inst->getOperand(1);
-  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-
-  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
-    .addImm(31)
-    .addReg(Src.getReg(), 0, AMDGPU::sub0);
-
-  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
-    .addReg(Src.getReg(), 0, AMDGPU::sub0)
-    .addImm(AMDGPU::sub0)
-    .addReg(TmpReg)
-    .addImm(AMDGPU::sub1);
-
-  MRI.replaceRegWith(Dest.getReg(), ResultReg);
-}
-
-void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
-                                        MachineInstr *Inst) const {
-  // Add the implicit and explicit register definitions.
- if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } - } - - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } - } -} - -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, - int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); - - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - - // First we need to consider the instruction's operand requirements before - // legalizing. Some operands are required to be SGPRs, such as implicit uses - // of VCC, but we are still bound by the constant bus requirement to only use - // one. - // - // If the operand's class is an SGPR, we can never move it. - - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. - if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } - - unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - - for (unsigned i = 0; i < 3; ++i) { - int Idx = OpIndices[i]; - if (Idx == -1) - break; - - const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); - - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } - - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; - - // We don't have a required SGPR operand, so we have a bit more freedom in - // selecting operands to move. - - // Try to select the most used SGPR. If an SGPR is equal to one of the - // others, we choose that. - // - // e.g. 
- // V_FMA_F32 v0, s0, s0, s0 -> No moves - // V_FMA_F32 v0, s0, s1, s0 -> Move s1 - - if (UsedSGPRs[0] != AMDGPU::NoRegister) { - if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) - SGPRReg = UsedSGPRs[0]; - } - - if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { - if (UsedSGPRs[1] == UsedSGPRs[2]) - SGPRReg = UsedSGPRs[1]; - } - - return SGPRReg; -} - -MachineInstrBuilder SIInstrInfo::buildIndirectWrite( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) - .addReg(IndirectBaseReg, RegState::Define) - .addOperand(I->getOperand(0)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0) - .addReg(ValueReg); -} - -MachineInstrBuilder SIInstrInfo::buildIndirectRead( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0); - -} - -void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - int End = getIndirectIndexEnd(MF); - int Begin = getIndirectIndexBegin(MF); - - if (End == -1) - return; - - - for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); -} - -MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, - unsigned OperandName) const { - int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); - if (Idx == -1) - return nullptr; - - return &MI.getOperand(Idx); -} - -uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { - uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; - if (ST.isAmdHsaOS()) - RsrcDataFormat |= (1ULL << 56); - - return RsrcDataFormat; -} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h deleted file mode 100644 index 64b5120841c..00000000000 --- a/lib/Target/R600/SIInstrInfo.h +++ /dev/null @@ -1,391 +0,0 @@ -//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for SIInstrInfo. 
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-
-#include "AMDGPUInstrInfo.h"
-#include "SIDefines.h"
-#include "SIRegisterInfo.h"
-
-namespace llvm {
-
-class SIInstrInfo : public AMDGPUInstrInfo {
-private:
-  const SIRegisterInfo RI;
-
-  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
-                              MachineRegisterInfo &MRI,
-                              MachineOperand &SuperReg,
-                              const TargetRegisterClass *SuperRC,
-                              unsigned SubIdx,
-                              const TargetRegisterClass *SubRC) const;
-  MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
-                                         MachineRegisterInfo &MRI,
-                                         MachineOperand &SuperReg,
-                                         const TargetRegisterClass *SuperRC,
-                                         unsigned SubIdx,
-                                         const TargetRegisterClass *SubRC) const;
-
-  unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
-                         MachineBasicBlock::iterator MI,
-                         MachineRegisterInfo &MRI,
-                         const TargetRegisterClass *RC,
-                         const MachineOperand &Op) const;
-
-  void swapOperands(MachineBasicBlock::iterator Inst) const;
-
-  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                               MachineInstr *Inst, unsigned Opcode) const;
-
-  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                                MachineInstr *Inst, unsigned Opcode) const;
-
-  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
-                            MachineInstr *Inst) const;
-  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
-                           MachineInstr *Inst) const;
-
-  void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
-
-  bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
-                                    MachineInstr *MIb) const;
-
-  unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
-
-public:
-  explicit SIInstrInfo(const AMDGPUSubtarget &st);
-
-  const SIRegisterInfo &getRegisterInfo() const override {
-    return RI;
-  }
-
-  bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
-                                         AliasAnalysis *AA) const override;
-
-  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                               int64_t &Offset1,
-                               int64_t &Offset2) const override;
-
-  bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
-                            unsigned &BaseReg, unsigned &Offset,
-                            const TargetRegisterInfo *TRI) const final;
-
-  bool shouldClusterLoads(MachineInstr *FirstLdSt,
-                          MachineInstr *SecondLdSt,
-                          unsigned NumLoads) const final;
-
-  void copyPhysReg(MachineBasicBlock &MBB,
-                   MachineBasicBlock::iterator MI, DebugLoc DL,
-                   unsigned DestReg, unsigned SrcReg,
-                   bool KillSrc) const override;
-
-  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MI,
-                                    RegScavenger *RS,
-                                    unsigned TmpReg,
-                                    unsigned Offset,
-                                    unsigned Size) const;
-
-  void storeRegToStackSlot(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI,
-                           unsigned SrcReg, bool isKill, int FrameIndex,
-                           const TargetRegisterClass *RC,
-                           const TargetRegisterInfo *TRI) const override;
-
-  void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI,
-                            unsigned DestReg, int FrameIndex,
-                            const TargetRegisterClass *RC,
-                            const TargetRegisterInfo *TRI) const override;
-
-  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
-  // \brief Returns an opcode that can be used to move a value to a \p DstRC
-  // register. If there is no hardware instruction that can store to \p
-  // DstRC, then AMDGPU::COPY is returned.
- unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; - unsigned commuteOpcode(const MachineInstr &MI) const; - - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; - bool findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const override; - - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - - bool areMemAccessesTriviallyDisjoint( - MachineInstr *MIa, MachineInstr *MIb, - AliasAnalysis *AA = nullptr) const override; - - MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; - bool isMov(unsigned Opcode) const override; - - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const final; - - unsigned getMachineCSELookAheadLimit() const override { return 500; } - - bool isSALU(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SALU; - } - - bool isVALU(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VALU; - } - - bool isSOP1(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOP1; - } - - bool isSOP2(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOP2; - } - - bool isSOPC(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOPC; - } - - bool isSOPK(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOPK; - } - - bool isSOPP(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOPP; - } - - bool isVOP1(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP1; - } - - bool isVOP2(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP2; - } - - bool isVOP3(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP3; - } - - bool isVOPC(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOPC; - } - - bool isMUBUF(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MUBUF; - } - - bool isMTBUF(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MTBUF; - } - - bool isSMRD(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SMRD; - } - - bool isDS(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::DS; - } - - bool isMIMG(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MIMG; - } - - bool isFLAT(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::FLAT; - } - - bool isWQM(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::WQM; - } - - bool isVGPRSpill(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; - } - - bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; - bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; - - bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const; - - /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. - /// This function will return false if you pass it a 32-bit instruction. - bool hasVALU32BitEncoding(unsigned Opcode) const; - - /// \brief Returns true if this operand uses the constant bus. 
- bool usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO, - unsigned OpSize) const; - - /// \brief Return true if this instruction has any modifiers. - /// e.g. src[012]_mod, omod, clamp. - bool hasModifiers(unsigned Opcode) const; - - bool hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const; - - bool verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const override; - - static unsigned getVALUOp(const MachineInstr &MI); - - bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; - - /// \brief Return the correct register class for \p OpNo. For target-specific - /// instructions, this will return the register class that has been defined - /// in tablegen. For generic instructions, like REG_SEQUENCE it will return - /// the register class of its machine operand. - /// to infer the correct register class base on the other operands. - const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const; - - /// \brief Return the size in bytes of the operand OpNo on the given - // instruction opcode. - unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { - const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; - - if (OpInfo.RegClass == -1) { - // If this is an immediate operand, this must be a 32-bit literal. - assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); - return 4; - } - - return RI.getRegClass(OpInfo.RegClass)->getSize(); - } - - /// \brief This form should usually be preferred since it handles operands - /// with unknown register classes. - unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { - return getOpRegClass(MI, OpNo)->getSize(); - } - - /// \returns true if it is legal for the operand at index \p OpNo - /// to read a VGPR. - bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; - - /// \brief Legalize the \p OpIndex operand of this instruction by inserting - /// a MOV. For example: - /// ADD_I32_e32 VGPR0, 15 - /// to - /// MOV VGPR1, 15 - /// ADD_I32_e32 VGPR0, VGPR1 - /// - /// If the operand being legalized is a register, then a COPY will be used - /// instead of MOV. - void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; - - /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand - /// for \p MI. - bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, - const MachineOperand *MO = nullptr) const; - - /// \brief Legalize all operands in this instruction. This function may - /// create new instruction and insert them before \p MI. - void legalizeOperands(MachineInstr *MI) const; - - /// \brief Split an SMRD instruction into two smaller loads of half the - // size storing the results in \p Lo and \p Hi. - void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const; - - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; - - /// \brief Replace this instruction's opcode with the equivalent VALU - /// opcode. This function will also move the users of \p MI to the - /// VALU if necessary. 
- void moveToVALU(MachineInstr &MI) const; - - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; - - const TargetRegisterClass *getIndirectAddrRegClass() const override; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const override; - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const override; - void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; - - void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, - unsigned SavReg, unsigned IndexReg) const; - - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; - - /// \brief Returns the operand named \p Op. If \p MI does not have an - /// operand named \c Op, this function returns nullptr. - MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; - - const MachineOperand *getNamedOperand(const MachineInstr &MI, - unsigned OpName) const { - return getNamedOperand(const_cast(MI), OpName); - } - - uint64_t getDefaultRsrcDataFormat() const; - -}; - -namespace AMDGPU { - - int getVOPe64(uint16_t Opcode); - int getVOPe32(uint16_t Opcode); - int getCommuteRev(uint16_t Opcode); - int getCommuteOrig(uint16_t Opcode); - int getAddr64Inst(uint16_t Opcode); - int getAtomicRetOp(uint16_t Opcode); - int getAtomicNoRetOp(uint16_t Opcode); - - const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - const uint64_t RSRC_TID_ENABLE = 1LL << 55; - -} // End namespace AMDGPU - -namespace SI { -namespace KernelInputOffsets { - -/// Offsets in bytes from the start of the input buffer -enum Offsets { - NGROUPS_X = 0, - NGROUPS_Y = 4, - NGROUPS_Z = 8, - GLOBAL_SIZE_X = 12, - GLOBAL_SIZE_Y = 16, - GLOBAL_SIZE_Z = 20, - LOCAL_SIZE_X = 24, - LOCAL_SIZE_Y = 28, - LOCAL_SIZE_Z = 32 -}; - -} // End namespace KernelInputOffsets -} // End namespace SI - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td deleted file mode 100644 index 93e4ca74ec3..00000000000 --- a/lib/Target/R600/SIInstrInfo.td +++ /dev/null @@ -1,2647 +0,0 @@ -//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -def isCI : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; - -def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; - -class vop { - field bits<9> SI3; - field bits<10> VI3; -} - -class vopc si, bits<8> vi = !add(0x40, si)> : vop { - field bits<8> SI = si; - field bits<8> VI = vi; - - field bits<9> SI3 = {0, si{7-0}}; - field bits<10> VI3 = {0, 0, vi{7-0}}; -} - -class vop1 si, bits<8> vi = si> : vop { - field bits<8> SI = si; - field bits<8> VI = vi; - - field bits<9> SI3 = {1, 1, si{6-0}}; - field bits<10> VI3 = !add(0x140, vi); -} - -class vop2 si, bits<6> vi = si> : vop { - field bits<6> SI = si; - field bits<6> VI = vi; - - field bits<9> SI3 = {1, 0, 0, si{5-0}}; - field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; -} - -// Specify a VOP2 opcode for SI and VOP3 opcode for VI -// that doesn't have VOP2 encoding on VI -class vop23 si, bits<10> vi> : vop2 { - let VI3 = vi; -} - -class vop3 si, bits<10> vi = {0, si}> : vop { - let SI3 = si; - let VI3 = vi; -} - -class sop1 si, bits<8> vi = si> { - field bits<8> SI = si; - field bits<8> VI = vi; -} - -class sop2 si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -class sopk si, bits<5> vi = si> { - field bits<5> SI = si; - field bits<5> VI = vi; -} - -// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum -// in AMDGPUInstrInfo.cpp -def SISubtarget { - int NONE = -1; - int SI = 0; - int VI = 1; -} - -//===----------------------------------------------------------------------===// -// SI DAG Nodes -//===----------------------------------------------------------------------===// - -def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, - [SDNPMayLoad, SDNPMemOperand] ->; - -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) - ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - -def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, - SDTCisVT<3, i32>]> ->; - -class SDSample : SDNode , SDTCisVT<2, v32i8>, - SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> ->; - -def SIsample : SDSample<"AMDGPUISD::SAMPLE">; -def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; -def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; -def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; - -def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> ->; - -//===----------------------------------------------------------------------===// -// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 -// to be glued to the memory instructions. 
-//===----------------------------------------------------------------------===// - -def SIld_local : SDNode <"ISD::LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return isLocalLoad(cast(N)); -}]>; - -def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED && - cast(N)->getExtensionType() == ISD::NON_EXTLOAD; -}]>; - -def si_load_local_align8 : Aligned8Bytes < - (ops node:$ptr), (si_load_local node:$ptr) ->; - -def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast(N)->getExtensionType() == ISD::SEXTLOAD; -}]>; -def si_az_extload_local : AZExtLoadBase ; - -multiclass SIExtLoadLocal { - - def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast(N)->getMemoryVT() == MVT::i8;}] - >; - - def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast(N)->getMemoryVT() == MVT::i16;}] - >; -} - -defm si_sextload_local : SIExtLoadLocal ; -defm si_az_extload_local : SIExtLoadLocal ; - -def SIst_local : SDNode <"ISD::STORE", SDTStore, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] ->; - -def si_st_local : PatFrag < - (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return isLocalStore(cast(N)); -}]>; - -def si_store_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED && - !cast(N)->isTruncatingStore(); -}]>; - -def si_store_local_align8 : Aligned8Bytes < - (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) ->; - -def si_truncstore_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast(N)->isTruncatingStore(); -}]>; - -def si_truncstore_local_i8 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def si_truncstore_local_i16 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -multiclass SIAtomicM0Glue2 { - - def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] - >; - - def _local : local_binary_atomic_op (NAME#"_glue")>; -} - -defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; -defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; -defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; -defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; -defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; -defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; -defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; -defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; -defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; -defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; - -def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -defm si_atomic_cmp_swap : AtomicCmpSwapLocal ; - -// Transformation function, extract the lower 32bit of a 64bit immediate -def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), - MVT::i32); -}]>; - -def LO32f : SDNodeXFormgetValueAPF().bitcastToAPInt().trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); -}]>; - -// Transformation 
function, extract the upper 32bit of a 64bit immediate -def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); -}]>; - -def HI32f : SDNodeXFormgetValueAPF().bitcastToAPInt().lshr(32).trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), - MVT::f32); -}]>; - -def IMM8bitDWORD : PatLeaf <(imm), - [{return (N->getZExtValue() & ~0x3FC) == 0;}] ->; - -def as_dword_i32imm : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); -}]>; - -def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); -}]>; - -def as_i8imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); -}]>; - -def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); -}]>; - -def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); -}]>; - -def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); -}]>; - -// Copied from the AArch64 backend: -def bitcast_fpimm_to_i32 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -// Copied from the AArch64 backend: -def bitcast_fpimm_to_i64 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); -}]>; - -def IMM8bit : PatLeaf <(imm), - [{return isUInt<8>(N->getZExtValue());}] ->; - -def IMM12bit : PatLeaf <(imm), - [{return isUInt<12>(N->getZExtValue());}] ->; - -def IMM16bit : PatLeaf <(imm), - [{return isUInt<16>(N->getZExtValue());}] ->; - -def IMM20bit : PatLeaf <(imm), - [{return isUInt<20>(N->getZExtValue());}] ->; - -def IMM32bit : PatLeaf <(imm), - [{return isUInt<32>(N->getZExtValue());}] ->; - -def mubuf_vaddr_offset : PatFrag< - (ops node:$ptr, node:$offset, node:$imm_offset), - (add (add node:$ptr, node:$offset), node:$imm_offset) ->; - -class InlineImm : PatLeaf <(vt imm), [{ - return isInlineImmediate(N); -}]>; - -class InlineFPImm : PatLeaf <(vt fpimm), [{ - return isInlineImmediate(N); -}]>; - -class SGPRImm : PatLeafgetGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return false; - } - const SIRegisterInfo *SIRI = - static_cast(Subtarget->getRegisterInfo()); - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { - return true; - } - } - return false; -}]>; - -//===----------------------------------------------------------------------===// -// Custom Operands -//===----------------------------------------------------------------------===// - -def FRAMEri32 : Operand { - let MIOperandInfo = (ops i32:$ptr, i32imm:$index); -} - -def SoppBrTarget : AsmOperandClass { - let Name = "SoppBrTarget"; - let ParserMethod = "parseSOppBrTarget"; -} - -def sopp_brtarget : Operand { - let EncoderMethod = "getSOPPBrEncoding"; - let OperandType = "OPERAND_PCREL"; - let ParserMatchClass = SoppBrTarget; -} - -include "SIInstrFormats.td" -include "VIInstrFormats.td" - -def MubufOffsetMatchClass : AsmOperandClass { - let Name = "MubufOffset"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; -} - -class DSOffsetBaseMatchClass : AsmOperandClass { - let Name = "DSOffset"#parser; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset"; -} - -def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; -def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; - -def 
DSOffset01MatchClass : AsmOperandClass { - let Name = "DSOffset1"; - let ParserMethod = "parseDSOff01OptionalOps"; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset01"; -} - -class GDSBaseMatchClass : AsmOperandClass { - let Name = "GDS"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; -def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; - -class GLCBaseMatchClass : AsmOperandClass { - let Name = "GLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; -def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; - -class SLCBaseMatchClass : AsmOperandClass { - let Name = "SLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; -def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; -def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; - -class TFEBaseMatchClass : AsmOperandClass { - let Name = "TFE"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; -def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; -def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; - -def OModMatchClass : AsmOperandClass { - let Name = "OMod"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; -} - -def ClampMatchClass : AsmOperandClass { - let Name = "Clamp"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; -} - -let OperandType = "OPERAND_IMMEDIATE" in { - -def offen : Operand { - let PrintMethod = "printOffen"; -} -def idxen : Operand { - let PrintMethod = "printIdxen"; -} -def addr64 : Operand { - let PrintMethod = "printAddr64"; -} -def mbuf_offset : Operand { - let PrintMethod = "printMBUFOffset"; - let ParserMatchClass = MubufOffsetMatchClass; -} -class ds_offset_base : Operand { - let PrintMethod = "printDSOffset"; - let ParserMatchClass = mc; -} -def ds_offset : ds_offset_base ; -def ds_offset_gds : ds_offset_base ; - -def ds_offset0 : Operand { - let PrintMethod = "printDSOffset0"; - let ParserMatchClass = DSOffset01MatchClass; -} -def ds_offset1 : Operand { - let PrintMethod = "printDSOffset1"; - let ParserMatchClass = DSOffset01MatchClass; -} -class gds_base : Operand { - let PrintMethod = "printGDS"; - let ParserMatchClass = mc; -} -def gds : gds_base ; - -def gds01 : gds_base ; - -class glc_base : Operand { - let PrintMethod = "printGLC"; - let ParserMatchClass = mc; -} - -def glc : glc_base ; -def glc_flat : glc_base ; - -class slc_base : Operand { - let PrintMethod = "printSLC"; - let ParserMatchClass = mc; -} - -def slc : slc_base ; -def slc_flat : slc_base ; -def slc_flat_atomic : slc_base ; - -class tfe_base : Operand { - let PrintMethod = "printTFE"; - let ParserMatchClass = mc; -} - -def tfe : tfe_base ; -def tfe_flat : tfe_base ; -def tfe_flat_atomic : tfe_base ; - -def omod : Operand { - let PrintMethod = "printOModSI"; - let ParserMatchClass = OModMatchClass; -} - -def ClampMod : Operand { - let PrintMethod = 
"printClampSI"; - let ParserMatchClass = ClampMatchClass; -} - -} // End OperandType = "OPERAND_IMMEDIATE" - -def VOPDstS64 : VOPDstOperand ; - -//===----------------------------------------------------------------------===// -// Complex patterns -//===----------------------------------------------------------------------===// - -def DS1Addr1Offset : ComplexPattern; -def DS64Bit4ByteAligned : ComplexPattern; - -def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; -def MUBUFAddr64Atomic : ComplexPattern; -def MUBUFScratch : ComplexPattern; -def MUBUFOffset : ComplexPattern; -def MUBUFOffsetAtomic : ComplexPattern; - -def VOP3Mods0 : ComplexPattern; -def VOP3Mods0Clamp : ComplexPattern; -def VOP3Mods0Clamp0OMod : ComplexPattern; -def VOP3Mods : ComplexPattern; - -//===----------------------------------------------------------------------===// -// SI assembler operands -//===----------------------------------------------------------------------===// - -def SIOperand { - int ZERO = 0x80; - int VCC = 0x6A; - int FLAT_SCR = 0x68; -} - -def SRCMODS { - int NONE = 0; - int NEG = 1; -} - -def DSTCLAMP { - int NONE = 0; -} - -def DSTOMOD { - int NONE = 0; -} - -//===----------------------------------------------------------------------===// -// -// SI Instruction multiclass helpers. -// -// Instructions with _32 take 32-bit operands. -// Instructions with _64 take 64-bit operands. -// -// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit -// encoding is the standard encoding, but instruction that make use of -// any of the instruction modifiers must use the 64-bit encoding. -// -// Instructions with _e32 use the 32-bit encoding. -// Instructions with _e64 use the 64-bit encoding. -// -//===----------------------------------------------------------------------===// - -class SIMCInstr { - string PseudoInstr = pseudo; - int Subtarget = subtarget; -} - -//===----------------------------------------------------------------------===// -// EXP classes -//===----------------------------------------------------------------------===// - -class EXPCommon : InstSI< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), - "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { - - let EXP_CNT = 1; - let Uses = [EXEC]; -} - -multiclass EXP_m { - - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; - } - - def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; - - def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; -} - -//===----------------------------------------------------------------------===// -// Scalar classes -//===----------------------------------------------------------------------===// - -class SOP1_Pseudo pattern> : - SOP1 , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOP1_Real_si : - SOP1 , - SOP1e , - SIMCInstr { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; -} - -class SOP1_Real_vi : - SOP1 , - SOP1e , - SIMCInstr { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; -} - -multiclass SOP1_m pattern> { - - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si ; - - def _vi : SOP1_Real_vi ; - -} - -multiclass SOP1_32 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern ->; - -multiclass SOP1_64 pattern> : SOP1_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" 
$dst, $src0", pattern ->; - -// no input, 64-bit output. -multiclass SOP1_64_0 pattern> { - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si { - let ssrc0 = 0; - } - - def _vi : SOP1_Real_vi { - let ssrc0 = 0; - } -} - -// 64-bit input, no output -multiclass SOP1_1 pattern> { - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si { - let sdst = 0; - } - - def _vi : SOP1_Real_vi { - let sdst = 0; - } -} - -// 64-bit input, 32-bit output. -multiclass SOP1_32_64 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern ->; - -class SOP2_Pseudo pattern> : - SOP2, - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; - let Size = 4; - - // Pseudo instructions have no encodings, but adding this field here allows - // us to do: - // let sdst = xxx in { - // for multiclasses that include both real and pseudo instructions. - field bits<7> sdst = 0; -} - -class SOP2_Real_si : - SOP2, - SOP2e, - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class SOP2_Real_vi : - SOP2, - SOP2e, - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass SOP2_SELECT_32 pattern> { - def "" : SOP2_Pseudo ; - - def _si : SOP2_Real_si ; - - def _vi : SOP2_Real_vi ; -} - -multiclass SOP2_m pattern> { - - def "" : SOP2_Pseudo ; - - def _si : SOP2_Real_si ; - - def _vi : SOP2_Real_vi ; - -} - -multiclass SOP2_32 pattern> : SOP2_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -multiclass SOP2_64 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -multiclass SOP2_64_32 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -class SOPC_Helper op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; - -class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; - -class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; - -class SOPK_Pseudo pattern> : - SOPK , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOPK_Real_si : - SOPK , - SOPKe , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - let isCodeGenOnly = 0; -} - -class SOPK_Real_vi : - SOPK , - SOPKe , - SIMCInstr { - let AssemblerPredicates = [isVI]; - let isCodeGenOnly = 0; -} - -multiclass SOPK_m { - def "" : SOPK_Pseudo ; - - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; - -} - -multiclass SOPK_32 pattern> { - def "" : SOPK_Pseudo ; - - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; -} - -multiclass SOPK_SCC pattern> { - def "" : SOPK_Pseudo ; - - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; - } -} - -multiclass SOPK_32TIE pattern> : SOPK_m < - op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), - " $sdst, $simm16" ->; - -multiclass SOPK_IMM32 { - - def "" : SOPK_Pseudo ; - - def _si : SOPK , - SOPK64e , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - let isCodeGenOnly = 0; - } - - def _vi : SOPK , - SOPK64e , - SIMCInstr { - let AssemblerPredicates = [isVI]; - let isCodeGenOnly = 0; - } -} -//===----------------------------------------------------------------------===// -// SMRD classes -//===----------------------------------------------------------------------===// - -class SMRD_Pseudo pattern> : - SMRD , - 
SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SMRD_Real_si op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD , - SMRDe , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class SMRD_Real_vi op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD , - SMEMe_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass SMRD_m op, string opName, bit imm, dag outs, dag ins, - string asm, list pattern> { - - def "" : SMRD_Pseudo ; - - def _si : SMRD_Real_si ; - - // glc is only applicable to scalar stores, which are not yet - // implemented. - let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; - } -} - -multiclass SMRD_Helper op, string opName, RegisterClass baseClass, - RegisterClass dstClass> { - defm _IMM : SMRD_m < - op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), - opName#" $dst, $sbase, $offset", [] - >; - - defm _SGPR : SMRD_m < - op, opName#"_SGPR", 0, (outs dstClass:$dst), - (ins baseClass:$sbase, SReg_32:$soff), - opName#" $dst, $sbase, $soff", [] - >; -} - -//===----------------------------------------------------------------------===// -// Vector ALU classes -//===----------------------------------------------------------------------===// - -// This must always be right before the operand being input modified. -def InputMods : OperandWithDefaultOps { - let PrintMethod = "printOperandAndMods"; -} - -def InputModsMatchClass : AsmOperandClass { - let Name = "RegWithInputMods"; -} - -def InputModsNoDefault : Operand { - let PrintMethod = "printOperandAndMods"; - let ParserMatchClass = InputModsMatchClass; -} - -class getNumSrcArgs { - int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 - !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 -} - -// Returns the register class to use for the destination of VOP[123C] -// instructions for the given VT. -class getVALUDstForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand, - !if(!eq(VT.Size, 64), VOPDstOperand, - VOPDstOperand)); // else VT == i1 -} - -// Returns the register class to use for source 0 of VOP[12C] -// instructions for the given VT. -class getVOPSrc0ForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); -} - -// Returns the register class to use for source 1 of VOP[12C] for the -// given VT. -class getVOPSrc1ForVT { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); -} - -// Returns the register class to use for sources of VOP3 instructions for the -// given VT. -class getVOP3SrcForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); -} - -// Returns 1 if the source arguments have modifiers, 0 if they do not. -class hasModifiers { - bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, 0)); -} - -// Returns the input arguments for VOP[12C] instructions for the given SrcVT. -class getIns32 { - dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 - !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 - (ins))); -} - -// Returns the input arguments for VOP3 instructions for the given SrcVT. 
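// Illustrative note, not from the original source: since hasModifiers above only
// returns 1 for f32/f64 source types, getIns64 yields the modifier-carrying form
//   (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, ..., ClampMod:$clamp, omod:$omod)
// for floating-point profiles, while integer profiles fall through to the plain
// (ins Src0RC:$src0, ...) variant with no clamp or omod operands.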
-class getIns64 { - - dag ret = - !if (!eq(NumSrcArgs, 1), - !if (!eq(HasModifiers, 1), - // VOP1 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP1 without modifiers - (ins Src0RC:$src0) - /* endif */ ), - !if (!eq(NumSrcArgs, 2), - !if (!eq(HasModifiers, 1), - // VOP 2 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP2 without modifiers - (ins Src0RC:$src0, Src1RC:$src1) - /* endif */ ) - /* NumSrcArgs == 3 */, - !if (!eq(HasModifiers, 1), - // VOP3 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - InputModsNoDefault:$src2_modifiers, Src2RC:$src2, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP3 without modifiers - (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) - /* endif */ ))); -} - -// Returns the assembly string for the inputs and outputs of a VOP[12C] -// instruction. This does not add the _e32 suffix, so it can be reused -// by getAsm64. -class getAsm32 { - string src1 = ", $src1"; - string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); -} - -// Returns the assembly string for the inputs and outputs of a VOP3 -// instruction. -class getAsm64 { - string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string src1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - string ret = - !if(!eq(HasModifiers, 0), - getAsm32.ret, - "$dst, "#src0#src1#src2#"$clamp"#"$omod"); -} - - -class VOPProfile _ArgVT> { - - field list ArgVT = _ArgVT; - - field ValueType DstVT = ArgVT[0]; - field ValueType Src0VT = ArgVT[1]; - field ValueType Src1VT = ArgVT[2]; - field ValueType Src2VT = ArgVT[3]; - field RegisterOperand DstRC = getVALUDstForVT.ret; - field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; - field RegisterClass Src1RC32 = getVOPSrc1ForVT.ret; - field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; - field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; - field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; - - field int NumSrcArgs = getNumSrcArgs.ret; - field bit HasModifiers = hasModifiers.ret; - - field dag Outs = (outs DstRC:$dst); - - field dag Ins32 = getIns32.ret; - field dag Ins64 = getIns64.ret; - - field string Asm32 = getAsm32.ret; - field string Asm64 = getAsm64.ret; -} - -// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order -// for the instruction patterns to work. 
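// Illustrative note, not from the original source: the F16/I16 profiles below are
// currently declared with f32/i32 value types (e.g. VOP_F16_F16 uses
// [f32, f32, untyped, untyped]), which is what the FIXME above refers to;
// half-precision patterns cannot match until these use real f16/i16 types.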
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; - -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; -def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; - -def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; -def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; -def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; -def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; -def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; -def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; -def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; - -def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; -def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; -def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; -def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; -def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; -def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { - let Src0RC32 = VCSrc_32; -} - -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; -} - -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; -} - -def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; -def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; -def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; -def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); - let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); - let Asm64 = "$dst, $src0, $src1, $src2"; -} - -def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; -def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = "$dst, $src0, $vsrc1, $src2"; -} -def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; -def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; -def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; - - -class VOP { - string OpName = opName; -} - -class VOP2_REV { - string RevOp = revOp; - bit IsOrig = isOrig; -} - -class AtomicNoRet { - string NoRetOp = noRetOp; - bit IsRet = isRet; -} - -class VOP1_Pseudo pattern, string opName> : - VOP1Common , - VOP , - SIMCInstr , - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; - - field bits<8> vdst; - field bits<9> src0; -} - -class VOP1_Real_si : - VOP1, - SIMCInstr { - let AssemblerPredicate = SIAssemblerPredicate; -} - -class VOP1_Real_vi : - VOP1, - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP1_m pattern, - string opName> { - def "" : VOP1_Pseudo ; - - def _si : VOP1_Real_si ; - - def _vi : VOP1_Real_vi ; -} - -multiclass VOP1SI_m pattern, - string opName> { - def "" : VOP1_Pseudo ; - - def _si : VOP1_Real_si ; -} - -class VOP2_Pseudo 
pattern, string opName> : - VOP2Common , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP2_Real_si : - VOP2 , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP2_Real_vi : - VOP2 , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP2SI_m pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo , - VOP2_REV; - - def _si : VOP2_Real_si ; -} - -multiclass VOP2_m pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo , - VOP2_REV; - - def _si : VOP2_Real_si ; - - def _vi : VOP2_Real_vi ; - -} - -class VOP3DisableFields { - - bits<2> src0_modifiers = !if(HasModifiers, ?, 0); - bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); - bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); - bits<2> omod = !if(HasModifiers, ?, 0); - bits<1> clamp = !if(HasModifiers, ?, 0); - bits<9> src1 = !if(HasSrc1, ?, 0); - bits<9> src2 = !if(HasSrc2, ?, 0); -} - -class VOP3DisableModFields { - bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); - bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); - bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); - bits<2> omod = !if(HasOutputMods, ?, 0); - bits<1> clamp = !if(HasOutputMods, ?, 0); -} - -class VOP3_Pseudo pattern, string opName> : - VOP3Common , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3e , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3e_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3be , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3be_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP3_m pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields; - def _vi : VOP3_Real_vi , - VOP3DisableFields; -} - -// VOP3_m without source modifiers -multiclass VOP3_m_nomods pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si ; - def _vi : VOP3_Real_vi ; - } -} - -multiclass VOP3_1_m pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields<0, 0, HasMods>; - - def _vi : VOP3_Real_vi , - VOP3DisableFields<0, 0, HasMods>; -} - -multiclass VOP3SI_1_m pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields<0, 0, HasMods>; - // No VI instruction. This class is for SI only. -} - -multiclass VOP3_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3_Real_vi , - VOP3DisableFields<1, 0, HasMods>; -} - -multiclass VOP3SI_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - // No VI instruction. This class is for SI only. 
-} - -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo , - VOP2_REV; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi , - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo ; - - - def _si : VOP3b_Real_si , - VOP3DisableFields<1, 1, HasMods>; - - def _vi : VOP3b_Real_vi , - VOP3DisableFields<1, 1, HasMods>; -} - -multiclass VOP3_C_m pattern, string opName, - bit HasMods, bit defExec, string revOp> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - } - - def _vi : VOP3_Real_vi , - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - } -} - -// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. -multiclass VOP2SI_3VI_m pattern = []> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : VOPAnyCommon , - SIMCInstr; - } - - def _si : VOP2 , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - } - - def _vi : VOP3Common , - VOP3e_vi , - VOP3DisableFields <1, 0, 0>, - SIMCInstr { - let AssemblerPredicates = [isVI]; - } -} - -multiclass VOP1_Helper pat32, - dag ins64, string asm64, list pat64, - bit HasMods> { - - defm _e32 : VOP1_m ; - - defm _e64 : VOP3_1_m ; -} - -multiclass VOP1Inst : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers ->; - -multiclass VOP1InstSI { - - defm _e32 : VOP1SI_m ; - - defm _e64 : VOP3SI_1_m ; -} - -multiclass VOP2_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m ; - - defm _e64 : VOP3_2_m ; -} - -multiclass VOP2Inst : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -multiclass VOP2InstSI { - defm _e32 : VOP2SI_m ; - - defm _e64 : VOP3SI_2_m ; -} - -multiclass VOP2b_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - - defm _e32 : VOP2_m ; - - defm _e64 : VOP3b_2_m ; -} - -multiclass VOP2bInst : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -// A VOP2 instruction that is VOP3-only on VI. 
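// Illustrative note, not from the original source: the helper below pairs a
// SI/CI-only 32-bit encoding (VOP2SI_m emits no _vi variant) with a VOP3 _e64
// form for both subtargets, so on VI these opcodes are only available in the
// 64-bit VOP3 encoding.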
-multiclass VOP2_VI3_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m ; - - defm _e64 : VOP3_2_m ; -} - -multiclass VOP2_VI3_Inst - : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -multiclass VOP2MADK pattern = []> { - - def "" : VOP2_Pseudo ; - -let isCodeGenOnly = 0 in { - def _si : VOP2Common , - SIMCInstr , - VOP2_MADKe { - let AssemblerPredicates = [isSICI]; - } - - def _vi : VOP2Common , - SIMCInstr , - VOP2_MADKe { - let AssemblerPredicates = [isVI]; - } -} // End isCodeGenOnly = 0 -} - -class VOPC_Pseudo pattern, string opName> : - VOPCCommon , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -multiclass VOPC_m pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo ; - - def _si : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - } - - def _vi : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - } -} - -multiclass VOPC_Helper pat32, - dag out64, dag ins64, string asm64, list pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m ; - - defm _e64 : VOP3_C_m ; -} - -// Special case for class instructions which only have modifiers on -// the 1st source operand. -multiclass VOPC_Class_Helper pat32, - dag out64, dag ins64, string asm64, list pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m ; - - defm _e64 : VOP3_C_m , - VOP3DisableModFields<1, 0, 0>; -} - -multiclass VOPCInst : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set i1:$dst, - (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp ->; - -multiclass VOPCClassInst : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set i1:$dst, - (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], - [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName ->; - - -multiclass VOPC_F32 : - VOPCInst ; - -multiclass VOPC_F64 : - VOPCInst ; - -multiclass VOPC_I32 : - VOPCInst ; - -multiclass VOPC_I64 : - VOPCInst ; - - -multiclass VOPCX - : VOPCInst ; - -multiclass VOPCX_F32 : - VOPCX ; - -multiclass VOPCX_F64 : - VOPCX ; - -multiclass VOPCX_I32 : - VOPCX ; - -multiclass VOPCX_I64 : - VOPCX ; - -multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < - op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods ->; - -multiclass VOPC_CLASS_F32 : - VOPCClassInst ; - -multiclass VOPCX_CLASS_F32 : - VOPCClassInst ; - -multiclass VOPC_CLASS_F64 : - VOPCClassInst ; - -multiclass VOPCX_CLASS_F64 : - VOPCClassInst ; - -multiclass VOP3Inst : VOP3_Helper < - op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, - !if(!eq(P.NumSrcArgs, 3), - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, - P.Src2VT:$src2))]), - !if(!eq(P.NumSrcArgs, 2), - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) - /* P.NumSrcArgs == 1 */, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), - P.NumSrcArgs, P.HasModifiers ->; - -// Special case for v_div_fmas_{f32|f64}, since it seems to be the -// only VOP instruction that implicitly reads VCC. -multiclass VOP3_VCC_Inst : VOP3_Helper < - op, opName, - (outs P.DstRC.RegClass:$dst), - (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, - InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, - InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, - ClampMod:$clamp, - omod:$omod), - " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), - (i1 VCC)))], - 3, 1 ->; - -multiclass VOP3b_Helper pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 ->; - -multiclass VOP3b_64 pattern> : - VOP3b_Helper ; - -multiclass VOP3b_32 pattern> : - VOP3b_Helper ; - - -class Vop3ModPat : Pat< - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), - (Inst i32:$src0_modifiers, P.Src0VT:$src0, - i32:$src1_modifiers, P.Src1VT:$src1, - i32:$src2_modifiers, P.Src2VT:$src2, - i1:$clamp, - i32:$omod)>; - -//===----------------------------------------------------------------------===// -// Interpolation opcodes -//===----------------------------------------------------------------------===// - -class VINTRP_Pseudo pattern> : - VINTRPCommon , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VINTRP_Real_si op, string opName, dag outs, dag ins, - string asm> : - VINTRPCommon , - VINTRPe , - SIMCInstr; - -class VINTRP_Real_vi op, string opName, dag outs, dag ins, - string asm> : - VINTRPCommon , - VINTRPe_vi , - SIMCInstr; - -multiclass VINTRP_m op, dag outs, dag ins, string asm, - list pattern = []> { - def "" : VINTRP_Pseudo ; - - def _si : VINTRP_Real_si ; - - def _vi : VINTRP_Real_vi ; -} - -//===----------------------------------------------------------------------===// -// Vector I/O classes -//===----------------------------------------------------------------------===// - -class DS_Pseudo pattern> : - DS , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class DS_Real_si op, string opName, dag outs, dag ins, string asm> : - 
DS , - DSe , - SIMCInstr { - let isCodeGenOnly = 0; -} - -class DS_Real_vi op, string opName, dag outs, dag ins, string asm> : - DS , - DSe_vi , - SIMCInstr ; - -class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm> : - DS_Real_si { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; - let isCodeGenOnly = 0; -} - -class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm> : - DS_Real_vi { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; -} - -multiclass DS_1A_RET op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr"#"$offset$gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, - gds01:$gds), - string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } -} - -multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let data1 = 0, vdst = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { - - def "" : DS_Pseudo ; - - let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } -} - -multiclass DS_1A1D_RET op, string opName, RegisterClass rc, - string noRetOp = "", - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A2D_RET_m op, string opName, RegisterClass rc, - string noRetOp = "", dag ins, - dag outs = (outs rc:$vdst), - string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; -} - -multiclass DS_1A2D_RET op, string asm, RegisterClass rc, - string noRetOp = "", RegisterClass src = rc> : - DS_1A2D_RET_m ; - -multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, - string noRetOp = opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let vdst = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_0A_RET op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins ds_offset:$offset, gds:$gds), - string asm 
= opName#" $vdst"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo ; - - let addr = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // end addr = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -multiclass DS_1A_RET_GDS op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), - string asm = opName#" $vdst, $addr"#"$offset gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0, gds = 1 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // end data0 = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A_GDS op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr), - string asm = opName#" $addr gds"> { - - def "" : DS_Pseudo ; - - let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } // end vdst = 0, data = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), - string asm = opName#" $addr"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo ; - - let vdst = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // let vdst = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -//===----------------------------------------------------------------------===// -// MTBUF classes -//===----------------------------------------------------------------------===// - -class MTBUF_Pseudo pattern> : - MTBUF , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class MTBUF_Real_si op, string opName, dag outs, dag ins, - string asm> : - MTBUF , - MTBUFe , - SIMCInstr; - -class MTBUF_Real_vi op, string opName, dag outs, dag ins, string asm> : - MTBUF , - MTBUFe_vi , - SIMCInstr ; - -multiclass MTBUF_m op, string opName, dag outs, dag ins, string asm, - list pattern> { - - def "" : MTBUF_Pseudo ; - - def _si : MTBUF_Real_si ; - - def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; - -} - -let mayStore = 1, mayLoad = 0 in { - -multiclass MTBUF_Store_Helper op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayStore = 1, mayLoad = 0 - -let mayLoad = 1, mayStore = 0 in { - -multiclass MTBUF_Load_Helper op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayLoad = 1, mayStore = 0 - -//===----------------------------------------------------------------------===// -// MUBUF classes -//===----------------------------------------------------------------------===// - -class mubuf si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -let isCodeGenOnly = 0 in { - -class MUBUF_si op, dag outs, dag ins, string asm, list pattern> 
: - MUBUF , MUBUFe { - let lds = 0; -} - -} // End let isCodeGenOnly = 0 - -class MUBUF_vi op, dag outs, dag ins, string asm, list pattern> : - MUBUF , MUBUFe_vi { - let lds = 0; -} - -class MUBUFAddr64Table { - bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; -} - -class MUBUF_Pseudo pattern> : - MUBUF , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; - - // dummy fields, so that we can use let statements around multiclasses - bits<1> offen; - bits<1> idxen; - bits<8> vaddr; - bits<1> glc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; -} - -class MUBUF_Real_si : - MUBUF , - MUBUFe , - SIMCInstr { - let lds = 0; -} - -class MUBUF_Real_vi : - MUBUF , - MUBUFe_vi , - SIMCInstr { - let lds = 0; -} - -multiclass MUBUF_m pattern> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <0>; - - let addr64 = 0, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si ; - } - - def _vi : MUBUF_Real_vi ; -} - -multiclass MUBUFAddr64_m pattern> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <1>; - - let addr64 = 1, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si ; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. -} - -multiclass MUBUFAtomicOffset_m pattern, bit is_return> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, - AtomicNoRet; - - let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { - let addr64 = 0 in { - def _si : MUBUF_Real_si ; - } - - def _vi : MUBUF_Real_vi ; - } -} - -multiclass MUBUFAtomicAddr64_m pattern, bit is_return> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, - AtomicNoRet; - - let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { - def _si : MUBUF_Real_si ; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. 
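  // Illustrative note, not from the original source: the MUBUF addr64 addressing
  // mode is not available on VI, which is presumably why only an _si real
  // instruction is emitted here and the pseudo has to be re-lowered to a
  // different addressing mode when targeting VI.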
-} - -multiclass MUBUF_Atomic { - - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { - - // No return variants - let glc = 0 in { - - defm _ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 - >; - - defm _OFFSET : MUBUFAtomicOffset_m < - op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, - slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 - >; - } // glc = 0 - - // Variant that return values - let glc = 1, Constraints = "$vdata = $vdata_in", - DisableEncoding = "$vdata_in" in { - - defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", - [(set vt:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc), vt:$vdata_in))], 1 - >; - - defm _RTN_OFFSET : MUBUFAtomicOffset_m < - op, name#"_rtn_offset", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", - [(set vt:$vdata, - (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$slc), vt:$vdata_in))], 1 - >; - - } // glc = 1 - - } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 -} - -multiclass MUBUF_Load_Helper { - - let mayLoad = 1, mayStore = 0 in { - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m ; - } - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m ; - } - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m ; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m ; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m ; - } - } -} - -multiclass MUBUF_Store_Helper { - let mayLoad = 0, mayStore = 1 in { - defm : MUBUF_m ; - - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m ; - } // offen = 0, idxen = 0, vaddr = 0 - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m ; - } // end offen = 1, idxen = 0 - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m ; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m ; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m ; - } - } // End mayLoad = 0, mayStore = 1 -} - -class FLAT_Load_Helper op, string asm, RegisterClass regClass> : - FLAT { - let data = 0; - let mayLoad = 1; -} - -class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : - FLAT { - - let mayLoad = 0; - let mayStore = 1; - - // Encoding - let vdst = 0; -} - -multiclass FLAT_ATOMIC op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { - - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT , - AtomicNoRet { - let glc = 0; - let vdst = 0; - } - - def _RTN : FLAT , - AtomicNoRet { - let glc = 1; - } - } -} - -class MIMG_Mask { - string Op = op; - int Channels = channels; -} - -class MIMG_NoSampler_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc", - []> { - let 
ssamp = 0; - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; -} - -multiclass MIMG_NoSampler_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V2 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V4 : MIMG_NoSampler_Helper , - MIMG_Mask; -} - -multiclass MIMG_NoSampler op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper ; - defm _V2 : MIMG_NoSampler_Src_Helper ; - defm _V3 : MIMG_NoSampler_Src_Helper ; - defm _V4 : MIMG_NoSampler_Src_Helper ; -} - -class MIMG_Sampler_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; - let WQM = wqm; -} - -multiclass MIMG_Sampler_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V2 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V4 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V8 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V16 : MIMG_Sampler_Helper , - MIMG_Mask; -} - -multiclass MIMG_Sampler op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} - -multiclass MIMG_Sampler_WQM op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} - -class MIMG_Gather_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - - // DMASK was repurposed for GATHER4. 4 components are always - // returned and DMASK works like a swizzle - it selects - // the component to fetch. The only useful DMASK values are - // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - // (red,red,red,red) etc.) The ISA document doesn't mention - // this. 
- // Therefore, disable all code which updates DMASK by setting these two: - let MIMG = 0; - let hasPostISelHook = 0; - let WQM = wqm; -} - -multiclass MIMG_Gather_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Gather_Helper , - MIMG_Mask; - def _V2 : MIMG_Gather_Helper , - MIMG_Mask; - def _V4 : MIMG_Gather_Helper , - MIMG_Mask; - def _V8 : MIMG_Gather_Helper , - MIMG_Mask; - def _V16 : MIMG_Gather_Helper , - MIMG_Mask; -} - -multiclass MIMG_Gather op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper; - defm _V2 : MIMG_Gather_Src_Helper; - defm _V3 : MIMG_Gather_Src_Helper; - defm _V4 : MIMG_Gather_Src_Helper; -} - -multiclass MIMG_Gather_WQM op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper; - defm _V2 : MIMG_Gather_Src_Helper; - defm _V3 : MIMG_Gather_Src_Helper; - defm _V4 : MIMG_Gather_Src_Helper; -} - -//===----------------------------------------------------------------------===// -// Vector instruction mappings -//===----------------------------------------------------------------------===// - -// Maps an opcode in e32 form to its e64 equivalent -def getVOPe64 : InstrMapping { - let FilterClass = "VOP"; - let RowFields = ["OpName"]; - let ColFields = ["Size"]; - let KeyCol = ["4"]; - let ValueCols = [["8"]]; -} - -// Maps an opcode in e64 form to its e32 equivalent -def getVOPe32 : InstrMapping { - let FilterClass = "VOP"; - let RowFields = ["OpName"]; - let ColFields = ["Size"]; - let KeyCol = ["8"]; - let ValueCols = [["4"]]; -} - -def getMaskedMIMGOp : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["4"]; - let ValueCols = [["1"], ["2"], ["3"] ]; -} - -// Maps an commuted opcode to its original version -def getCommuteOrig : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an original opcode to its commuted version -def getCommuteRev : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - -def getCommuteCmpOrig : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an original opcode to its commuted version -def getCommuteCmpRev : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - - -def getMCOpcodeGen : InstrMapping { - let FilterClass = "SIMCInstr"; - let RowFields = ["PseudoInstr"]; - let ColFields = ["Subtarget"]; - let KeyCol = [!cast(SISubtarget.NONE)]; - let ValueCols = [[!cast(SISubtarget.SI)],[!cast(SISubtarget.VI)]]; -} - -def getAddr64Inst : InstrMapping { - let FilterClass = "MUBUFAddr64Table"; - let RowFields = ["OpName"]; - let ColFields = ["IsAddr64"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an atomic opcode to its version with a return value. -def getAtomicRetOp : InstrMapping { - let FilterClass = "AtomicNoRet"; - let RowFields = ["NoRetOp"]; - let ColFields = ["IsRet"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an atomic opcode to its returnless version. 
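// Sketch, assuming the standard TableGen InstrMapping code generation (not part
// of this file): each mapping above becomes a lookup function emitted into
// AMDGPUGenInstrInfo.inc that returns -1 when no counterpart exists, so C++ code
// can switch an atomic to its returnless form roughly like this (MI and TII are
// assumed to be a MachineInstr& and a const SIInstrInfo*):
//   int NoRetOpc = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
//   if (NoRetOpc != -1)
//     MI.setDesc(TII->get(NoRetOpc));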
-def getAtomicNoRetOp : InstrMapping { - let FilterClass = "AtomicNoRet"; - let RowFields = ["NoRetOp"]; - let ColFields = ["IsRet"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - -include "SIInstructions.td" -include "CIInstructions.td" -include "VIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td deleted file mode 100644 index 8c8d836776d..00000000000 --- a/lib/Target/R600/SIInstructions.td +++ /dev/null @@ -1,3327 +0,0 @@ -//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This file was originally auto-generated from a GPU register header file and -// all the instruction definitions were originally commented out. Instructions -// that are not yet supported remain commented out. -//===----------------------------------------------------------------------===// - -class InterpSlots { -int P0 = 2; -int P10 = 0; -int P20 = 1; -} -def INTERP : InterpSlots; - -def InterpSlot : Operand { - let PrintMethod = "printInterpSlot"; -} - -def SendMsgImm : Operand { - let PrintMethod = "printSendMsg"; -} - -def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureGCN">; -def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; - -def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; -def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; - -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def WAIT_FLAG : InstFlag<"printWaitFlag"> { - let ParserMatchClass = SWaitMatchClass; -} - -let SubtargetPredicate = isGCN in { - -//===----------------------------------------------------------------------===// -// EXP Instructions -//===----------------------------------------------------------------------===// - -defm EXP : EXP_m; - -//===----------------------------------------------------------------------===// -// SMRD Instructions -//===----------------------------------------------------------------------===// - -let mayLoad = 1 in { - -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. 
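// Illustrative note, not from the original source: accordingly, only the
// single-dword loads below use SGPR_32 as the destination class; the dwordx2 and
// wider variants use the SReg_64/SReg_128/SReg_256/SReg_512 tuple classes, where
// the M0 hazard does not arise.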
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 ->; - -} // mayLoad = 1 - -//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; - -//===----------------------------------------------------------------------===// -// SOP1 Instructions -//===----------------------------------------------------------------------===// - -let isMoveImm = 1 in { - let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; - defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; - } // let isRematerializeable = 1 - - let Uses = [SCC] in { - defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; - defm S_CMOV_B64 : SOP1_64 , "s_cmov_b64", []>; - } // End Uses = [SCC] -} // End isMoveImm = 1 - -let Defs = [SCC] in { - defm S_NOT_B32 : SOP1_32 , "s_not_b32", - [(set i32:$dst, (not i32:$src0))] - >; - - defm S_NOT_B64 : SOP1_64 , "s_not_b64", - [(set i64:$dst, (not i64:$src0))] - >; - defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; - defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; -} // End Defs = [SCC] - - -defm S_BREV_B32 : SOP1_32 , "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] ->; -defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; - -let Defs = [SCC] in { - defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; - defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; - defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", - [(set i32:$dst, (ctpop i32:$src0))] - >; - defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; -} // End Defs = [SCC] - -defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; -defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; -defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", - [(set i32:$dst, (cttz_zero_undef i32:$src0))] ->; -defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; - -defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] ->; - -defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; -defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", - [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] ->; -defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; -defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", - [(set i32:$dst, (sext_inreg i32:$src0, i8))] ->; -defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", - [(set i32:$dst, (sext_inreg i32:$src0, i16))] ->; - -defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; -defm S_BITSET0_B64 : SOP1_64 , "s_bitset0_b64", []>; -defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; -defm S_BITSET1_B64 : SOP1_64 , "s_bitset1_b64", []>; -defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; -defm S_SETPC_B64 : SOP1_64 , "s_setpc_b64", []>; -defm S_SWAPPC_B64 : SOP1_64 , 
"s_swappc_b64", []>; -defm S_RFE_B64 : SOP1_64 , "s_rfe_b64", []>; - -let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { - -defm S_AND_SAVEEXEC_B64 : SOP1_64 , "s_and_saveexec_b64", []>; -defm S_OR_SAVEEXEC_B64 : SOP1_64 , "s_or_saveexec_b64", []>; -defm S_XOR_SAVEEXEC_B64 : SOP1_64 , "s_xor_saveexec_b64", []>; -defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 , "s_andn2_saveexec_b64", []>; -defm S_ORN2_SAVEEXEC_B64 : SOP1_64 , "s_orn2_saveexec_b64", []>; -defm S_NAND_SAVEEXEC_B64 : SOP1_64 , "s_nand_saveexec_b64", []>; -defm S_NOR_SAVEEXEC_B64 : SOP1_64 , "s_nor_saveexec_b64", []>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_64 , "s_xnor_saveexec_b64", []>; - -} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] - -defm S_QUADMASK_B32 : SOP1_32 , "s_quadmask_b32", []>; -defm S_QUADMASK_B64 : SOP1_64 , "s_quadmask_b64", []>; -defm S_MOVRELS_B32 : SOP1_32 , "s_movrels_b32", []>; -defm S_MOVRELS_B64 : SOP1_64 , "s_movrels_b64", []>; -defm S_MOVRELD_B32 : SOP1_32 , "s_movreld_b32", []>; -defm S_MOVRELD_B64 : SOP1_64 , "s_movreld_b64", []>; -defm S_CBRANCH_JOIN : SOP1_1 , "s_cbranch_join", []>; -defm S_MOV_REGRD_B32 : SOP1_32 , "s_mov_regrd_b32", []>; -let Defs = [SCC] in { - defm S_ABS_I32 : SOP1_32 , "s_abs_i32", []>; -} // End Defs = [SCC] -defm S_MOV_FED_B32 : SOP1_32 , "s_mov_fed_b32", []>; - -//===----------------------------------------------------------------------===// -// SOP2 Instructions -//===----------------------------------------------------------------------===// - -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; -defm S_ADD_I32 : SOP2_32 , "s_add_i32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; -defm S_SUB_I32 : SOP2_32 , "s_sub_i32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] - -defm S_MIN_I32 : SOP2_32 , "s_min_i32", - [(set i32:$dst, (smin i32:$src0, i32:$src1))] ->; -defm S_MIN_U32 : SOP2_32 , "s_min_u32", - [(set i32:$dst, (umin i32:$src0, i32:$src1))] ->; -defm S_MAX_I32 : SOP2_32 , "s_max_i32", - [(set i32:$dst, (smax i32:$src0, i32:$src1))] ->; -defm S_MAX_U32 : SOP2_32 , "s_max_u32", - [(set i32:$dst, (umax i32:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - - -let Uses = [SCC] in { - defm S_CSELECT_B32 : SOP2_32 , "s_cselect_b32", []>; - defm S_CSELECT_B64 : SOP2_64 , "s_cselect_b64", []>; -} // End Uses = [SCC] - -let Defs = [SCC] in { -defm S_AND_B32 : SOP2_32 , "s_and_b32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] ->; - -defm S_AND_B64 : SOP2_64 , "s_and_b64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] ->; - -defm S_OR_B32 : SOP2_32 , "s_or_b32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] ->; - -defm S_OR_B64 : SOP2_64 , "s_or_b64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] ->; - -defm S_XOR_B32 : SOP2_32 , "s_xor_b32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] ->; - -defm S_XOR_B64 : SOP2_64 , "s_xor_b64", - [(set i64:$dst, (xor i64:$src0, i64:$src1))] ->; -defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; -defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; -defm S_ORN2_B32 : SOP2_32 , "s_orn2_b32", []>; -defm 
S_ORN2_B64 : SOP2_64 , "s_orn2_b64", []>; -defm S_NAND_B32 : SOP2_32 , "s_nand_b32", []>; -defm S_NAND_B64 : SOP2_64 , "s_nand_b64", []>; -defm S_NOR_B32 : SOP2_32 , "s_nor_b32", []>; -defm S_NOR_B64 : SOP2_64 , "s_nor_b64", []>; -defm S_XNOR_B32 : SOP2_32 , "s_xnor_b32", []>; -defm S_XNOR_B64 : SOP2_64 , "s_xnor_b64", []>; -} // End Defs = [SCC] - -// Use added complexity so these patterns are preferred to the VALU patterns. -let AddedComplexity = 1 in { -let Defs = [SCC] in { - -defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; -defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] ->; -defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; -defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] ->; -defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] ->; -defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - -defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm S_BFM_B64 : SOP2_64 , "s_bfm_b64", []>; -defm S_MUL_I32 : SOP2_32 , "s_mul_i32", - [(set i32:$dst, (mul i32:$src0, i32:$src1))] ->; - -} // End AddedComplexity = 1 - -let Defs = [SCC] in { -defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; -defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; -defm S_BFE_U64 : SOP2_64 , "s_bfe_u64", []>; -defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; -} // End Defs = [SCC] - -let sdst = 0 in { -defm S_CBRANCH_G_FORK : SOP2_m < - sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), - (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] ->; -} - -let Defs = [SCC] in { -defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; -} // End Defs = [SCC] - -//===----------------------------------------------------------------------===// -// SOPC Instructions -//===----------------------------------------------------------------------===// - -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; - -//===----------------------------------------------------------------------===// -// SOPK Instructions -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1 in { -defm S_MOVK_I32 : SOPK_32 , "s_movk_i32", []>; -} // End isReMaterializable = 1 -let Uses = [SCC] in { - defm S_CMOVK_I32 : SOPK_32 , "s_cmovk_i32", []>; -} - -let isCompare = 1 in { - -/* -This instruction is disabled for 
now until we can figure out how to teach -the instruction selector to correctly use the S_CMP* vs V_CMP* -instructions. - -When this instruction is enabled the code generator sometimes produces this -invalid sequence: - -SCC = S_CMPK_EQ_I32 SGPR0, imm -VCC = COPY SCC -VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 - -defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", - [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] ->; -*/ - -defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", []>; -defm S_CMPK_LG_I32 : SOPK_SCC , "s_cmpk_lg_i32", []>; -defm S_CMPK_GT_I32 : SOPK_SCC , "s_cmpk_gt_i32", []>; -defm S_CMPK_GE_I32 : SOPK_SCC , "s_cmpk_ge_i32", []>; -defm S_CMPK_LT_I32 : SOPK_SCC , "s_cmpk_lt_i32", []>; -defm S_CMPK_LE_I32 : SOPK_SCC , "s_cmpk_le_i32", []>; -defm S_CMPK_EQ_U32 : SOPK_SCC , "s_cmpk_eq_u32", []>; -defm S_CMPK_LG_U32 : SOPK_SCC , "s_cmpk_lg_u32", []>; -defm S_CMPK_GT_U32 : SOPK_SCC , "s_cmpk_gt_u32", []>; -defm S_CMPK_GE_U32 : SOPK_SCC , "s_cmpk_ge_u32", []>; -defm S_CMPK_LT_U32 : SOPK_SCC , "s_cmpk_lt_u32", []>; -defm S_CMPK_LE_U32 : SOPK_SCC , "s_cmpk_le_u32", []>; -} // End isCompare = 1 - -let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", - Constraints = "$sdst = $src0" in { - defm S_ADDK_I32 : SOPK_32TIE , "s_addk_i32", []>; - defm S_MULK_I32 : SOPK_32TIE , "s_mulk_i32", []>; -} - -defm S_CBRANCH_I_FORK : SOPK_m < - sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), - (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; -defm S_GETREG_B32 : SOPK_32 , "s_getreg_b32", []>; -defm S_SETREG_B32 : SOPK_m < - sopk<0x13, 0x12>, "s_setreg_b32", (outs), - (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; -// FIXME: Not on SI? -//defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; -defm S_SETREG_IMM32_B32 : SOPK_IMM32 < - sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), - (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" ->; - -//===----------------------------------------------------------------------===// -// SOPP Instructions -//===----------------------------------------------------------------------===// - -def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(IL_retflag)]> { - let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", - [(br bb:$simm16)]> { - let isBarrier = 1; -} - -let DisableEncoding = "$scc" in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "s_cbranch_scc0 $simm16" ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "s_cbranch_scc1 $simm16" ->; -} // End DisableEncoding = "$scc" - -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "s_cbranch_vccz $simm16" ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "s_cbranch_vccnz $simm16" ->; - -let DisableEncoding = "$exec" in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "s_cbranch_execz $simm16" ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "s_cbranch_execnz $simm16" ->; -} // End DisableEncoding = "$exec" - - -} // End isBranch = 1 -} // End isTerminator = 1 - -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", - [(int_AMDGPU_barrier_local)] -> { - let simm16 = 0; - let isBarrier = 1; - let 
hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; -} - -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; -def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; -def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; -def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; - -let Uses = [EXEC, M0] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", - [(AMDGPUsendmsg (i32 imm:$simm16))] - >; -} // End Uses = [EXEC, M0] - -def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; -def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; -def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { - let simm16 = 0; -} -def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; -def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; -def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { - let simm16 = 0; -} -} // End hasSideEffects - -//===----------------------------------------------------------------------===// -// VOPC Instructions -//===----------------------------------------------------------------------===// - -let isCompare = 1, isCommutable = 1 in { - -defm V_CMP_F_F32 : VOPC_F32 , "v_cmp_f_f32">; -defm V_CMP_LT_F32 : VOPC_F32 , "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; -defm V_CMP_EQ_F32 : VOPC_F32 , "v_cmp_eq_f32", COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_F32 , "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; -defm V_CMP_GT_F32 : VOPC_F32 , "v_cmp_gt_f32", COND_OGT>; -defm V_CMP_LG_F32 : VOPC_F32 , "v_cmp_lg_f32", COND_ONE>; -defm V_CMP_GE_F32 : VOPC_F32 , "v_cmp_ge_f32", COND_OGE>; -defm V_CMP_O_F32 : VOPC_F32 , "v_cmp_o_f32", COND_O>; -defm V_CMP_U_F32 : VOPC_F32 , "v_cmp_u_f32", COND_UO>; -defm V_CMP_NGE_F32 : VOPC_F32 , "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; -defm V_CMP_NLG_F32 : VOPC_F32 , "v_cmp_nlg_f32", COND_UEQ>; -defm V_CMP_NGT_F32 : VOPC_F32 , "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; -defm V_CMP_NLE_F32 : VOPC_F32 , "v_cmp_nle_f32", COND_UGT>; -defm V_CMP_NEQ_F32 : VOPC_F32 , "v_cmp_neq_f32", COND_UNE>; -defm V_CMP_NLT_F32 : VOPC_F32 , "v_cmp_nlt_f32", COND_UGE>; -defm V_CMP_TRU_F32 : VOPC_F32 , "v_cmp_tru_f32">; - - -defm V_CMPX_F_F32 : VOPCX_F32 , "v_cmpx_f_f32">; -defm V_CMPX_LT_F32 : VOPCX_F32 , "v_cmpx_lt_f32", "v_cmpx_gt_f32">; -defm V_CMPX_EQ_F32 : VOPCX_F32 , "v_cmpx_eq_f32">; -defm V_CMPX_LE_F32 : VOPCX_F32 , "v_cmpx_le_f32", "v_cmpx_ge_f32">; -defm V_CMPX_GT_F32 : VOPCX_F32 , "v_cmpx_gt_f32">; -defm V_CMPX_LG_F32 : VOPCX_F32 , "v_cmpx_lg_f32">; -defm V_CMPX_GE_F32 : VOPCX_F32 , "v_cmpx_ge_f32">; -defm V_CMPX_O_F32 : VOPCX_F32 , "v_cmpx_o_f32">; -defm V_CMPX_U_F32 : VOPCX_F32 , "v_cmpx_u_f32">; -defm V_CMPX_NGE_F32 : VOPCX_F32 , "v_cmpx_nge_f32">; -defm V_CMPX_NLG_F32 : VOPCX_F32 , "v_cmpx_nlg_f32">; -defm V_CMPX_NGT_F32 : VOPCX_F32 , "v_cmpx_ngt_f32">; -defm V_CMPX_NLE_F32 : VOPCX_F32 , "v_cmpx_nle_f32">; -defm V_CMPX_NEQ_F32 : VOPCX_F32 , "v_cmpx_neq_f32">; -defm V_CMPX_NLT_F32 : VOPCX_F32 , "v_cmpx_nlt_f32">; -defm V_CMPX_TRU_F32 : VOPCX_F32 , "v_cmpx_tru_f32">; - - -defm V_CMP_F_F64 : VOPC_F64 , "v_cmp_f_f64">; -defm V_CMP_LT_F64 : VOPC_F64 , "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; -defm V_CMP_EQ_F64 : VOPC_F64 , "v_cmp_eq_f64", COND_OEQ>; -defm V_CMP_LE_F64 : VOPC_F64 , "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; -defm V_CMP_GT_F64 : VOPC_F64 , "v_cmp_gt_f64", COND_OGT>; -defm V_CMP_LG_F64 : VOPC_F64 , 
"v_cmp_lg_f64", COND_ONE>; -defm V_CMP_GE_F64 : VOPC_F64 , "v_cmp_ge_f64", COND_OGE>; -defm V_CMP_O_F64 : VOPC_F64 , "v_cmp_o_f64", COND_O>; -defm V_CMP_U_F64 : VOPC_F64 , "v_cmp_u_f64", COND_UO>; -defm V_CMP_NGE_F64 : VOPC_F64 , "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; -defm V_CMP_NLG_F64 : VOPC_F64 , "v_cmp_nlg_f64", COND_UEQ>; -defm V_CMP_NGT_F64 : VOPC_F64 , "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; -defm V_CMP_NLE_F64 : VOPC_F64 , "v_cmp_nle_f64", COND_UGT>; -defm V_CMP_NEQ_F64 : VOPC_F64 , "v_cmp_neq_f64", COND_UNE>; -defm V_CMP_NLT_F64 : VOPC_F64 , "v_cmp_nlt_f64", COND_UGE>; -defm V_CMP_TRU_F64 : VOPC_F64 , "v_cmp_tru_f64">; - - -defm V_CMPX_F_F64 : VOPCX_F64 , "v_cmpx_f_f64">; -defm V_CMPX_LT_F64 : VOPCX_F64 , "v_cmpx_lt_f64", "v_cmpx_gt_f64">; -defm V_CMPX_EQ_F64 : VOPCX_F64 , "v_cmpx_eq_f64">; -defm V_CMPX_LE_F64 : VOPCX_F64 , "v_cmpx_le_f64", "v_cmpx_ge_f64">; -defm V_CMPX_GT_F64 : VOPCX_F64 , "v_cmpx_gt_f64">; -defm V_CMPX_LG_F64 : VOPCX_F64 , "v_cmpx_lg_f64">; -defm V_CMPX_GE_F64 : VOPCX_F64 , "v_cmpx_ge_f64">; -defm V_CMPX_O_F64 : VOPCX_F64 , "v_cmpx_o_f64">; -defm V_CMPX_U_F64 : VOPCX_F64 , "v_cmpx_u_f64">; -defm V_CMPX_NGE_F64 : VOPCX_F64 , "v_cmpx_nge_f64", "v_cmpx_nle_f64">; -defm V_CMPX_NLG_F64 : VOPCX_F64 , "v_cmpx_nlg_f64">; -defm V_CMPX_NGT_F64 : VOPCX_F64 , "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; -defm V_CMPX_NLE_F64 : VOPCX_F64 , "v_cmpx_nle_f64">; -defm V_CMPX_NEQ_F64 : VOPCX_F64 , "v_cmpx_neq_f64">; -defm V_CMPX_NLT_F64 : VOPCX_F64 , "v_cmpx_nlt_f64">; -defm V_CMPX_TRU_F64 : VOPCX_F64 , "v_cmpx_tru_f64">; - - -let SubtargetPredicate = isSICI in { - -defm V_CMPS_F_F32 : VOPC_F32 , "v_cmps_f_f32">; -defm V_CMPS_LT_F32 : VOPC_F32 , "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; -defm V_CMPS_EQ_F32 : VOPC_F32 , "v_cmps_eq_f32">; -defm V_CMPS_LE_F32 : VOPC_F32 , "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; -defm V_CMPS_GT_F32 : VOPC_F32 , "v_cmps_gt_f32">; -defm V_CMPS_LG_F32 : VOPC_F32 , "v_cmps_lg_f32">; -defm V_CMPS_GE_F32 : VOPC_F32 , "v_cmps_ge_f32">; -defm V_CMPS_O_F32 : VOPC_F32 , "v_cmps_o_f32">; -defm V_CMPS_U_F32 : VOPC_F32 , "v_cmps_u_f32">; -defm V_CMPS_NGE_F32 : VOPC_F32 , "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; -defm V_CMPS_NLG_F32 : VOPC_F32 , "v_cmps_nlg_f32">; -defm V_CMPS_NGT_F32 : VOPC_F32 , "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; -defm V_CMPS_NLE_F32 : VOPC_F32 , "v_cmps_nle_f32">; -defm V_CMPS_NEQ_F32 : VOPC_F32 , "v_cmps_neq_f32">; -defm V_CMPS_NLT_F32 : VOPC_F32 , "v_cmps_nlt_f32">; -defm V_CMPS_TRU_F32 : VOPC_F32 , "v_cmps_tru_f32">; - - -defm V_CMPSX_F_F32 : VOPCX_F32 , "v_cmpsx_f_f32">; -defm V_CMPSX_LT_F32 : VOPCX_F32 , "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; -defm V_CMPSX_EQ_F32 : VOPCX_F32 , "v_cmpsx_eq_f32">; -defm V_CMPSX_LE_F32 : VOPCX_F32 , "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; -defm V_CMPSX_GT_F32 : VOPCX_F32 , "v_cmpsx_gt_f32">; -defm V_CMPSX_LG_F32 : VOPCX_F32 , "v_cmpsx_lg_f32">; -defm V_CMPSX_GE_F32 : VOPCX_F32 , "v_cmpsx_ge_f32">; -defm V_CMPSX_O_F32 : VOPCX_F32 , "v_cmpsx_o_f32">; -defm V_CMPSX_U_F32 : VOPCX_F32 , "v_cmpsx_u_f32">; -defm V_CMPSX_NGE_F32 : VOPCX_F32 , "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; -defm V_CMPSX_NLG_F32 : VOPCX_F32 , "v_cmpsx_nlg_f32">; -defm V_CMPSX_NGT_F32 : VOPCX_F32 , "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; -defm V_CMPSX_NLE_F32 : VOPCX_F32 , "v_cmpsx_nle_f32">; -defm V_CMPSX_NEQ_F32 : VOPCX_F32 , "v_cmpsx_neq_f32">; -defm V_CMPSX_NLT_F32 : VOPCX_F32 , "v_cmpsx_nlt_f32">; -defm V_CMPSX_TRU_F32 : VOPCX_F32 , "v_cmpsx_tru_f32">; - - -defm V_CMPS_F_F64 : VOPC_F64 , "v_cmps_f_f64">; -defm 
V_CMPS_LT_F64 : VOPC_F64 , "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; -defm V_CMPS_EQ_F64 : VOPC_F64 , "v_cmps_eq_f64">; -defm V_CMPS_LE_F64 : VOPC_F64 , "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; -defm V_CMPS_GT_F64 : VOPC_F64 , "v_cmps_gt_f64">; -defm V_CMPS_LG_F64 : VOPC_F64 , "v_cmps_lg_f64">; -defm V_CMPS_GE_F64 : VOPC_F64 , "v_cmps_ge_f64">; -defm V_CMPS_O_F64 : VOPC_F64 , "v_cmps_o_f64">; -defm V_CMPS_U_F64 : VOPC_F64 , "v_cmps_u_f64">; -defm V_CMPS_NGE_F64 : VOPC_F64 , "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; -defm V_CMPS_NLG_F64 : VOPC_F64 , "v_cmps_nlg_f64">; -defm V_CMPS_NGT_F64 : VOPC_F64 , "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; -defm V_CMPS_NLE_F64 : VOPC_F64 , "v_cmps_nle_f64">; -defm V_CMPS_NEQ_F64 : VOPC_F64 , "v_cmps_neq_f64">; -defm V_CMPS_NLT_F64 : VOPC_F64 , "v_cmps_nlt_f64">; -defm V_CMPS_TRU_F64 : VOPC_F64 , "v_cmps_tru_f64">; - - -defm V_CMPSX_F_F64 : VOPCX_F64 , "v_cmpsx_f_f64">; -defm V_CMPSX_LT_F64 : VOPCX_F64 , "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; -defm V_CMPSX_EQ_F64 : VOPCX_F64 , "v_cmpsx_eq_f64">; -defm V_CMPSX_LE_F64 : VOPCX_F64 , "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; -defm V_CMPSX_GT_F64 : VOPCX_F64 , "v_cmpsx_gt_f64">; -defm V_CMPSX_LG_F64 : VOPCX_F64 , "v_cmpsx_lg_f64">; -defm V_CMPSX_GE_F64 : VOPCX_F64 , "v_cmpsx_ge_f64">; -defm V_CMPSX_O_F64 : VOPCX_F64 , "v_cmpsx_o_f64">; -defm V_CMPSX_U_F64 : VOPCX_F64 , "v_cmpsx_u_f64">; -defm V_CMPSX_NGE_F64 : VOPCX_F64 , "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; -defm V_CMPSX_NLG_F64 : VOPCX_F64 , "v_cmpsx_nlg_f64">; -defm V_CMPSX_NGT_F64 : VOPCX_F64 , "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; -defm V_CMPSX_NLE_F64 : VOPCX_F64 , "v_cmpsx_nle_f64">; -defm V_CMPSX_NEQ_F64 : VOPCX_F64 , "v_cmpsx_neq_f64">; -defm V_CMPSX_NLT_F64 : VOPCX_F64 , "v_cmpsx_nlt_f64">; -defm V_CMPSX_TRU_F64 : VOPCX_F64 , "v_cmpsx_tru_f64">; - -} // End SubtargetPredicate = isSICI - -defm V_CMP_F_I32 : VOPC_I32 , "v_cmp_f_i32">; -defm V_CMP_LT_I32 : VOPC_I32 , "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; -defm V_CMP_EQ_I32 : VOPC_I32 , "v_cmp_eq_i32", COND_EQ>; -defm V_CMP_LE_I32 : VOPC_I32 , "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; -defm V_CMP_GT_I32 : VOPC_I32 , "v_cmp_gt_i32", COND_SGT>; -defm V_CMP_NE_I32 : VOPC_I32 , "v_cmp_ne_i32", COND_NE>; -defm V_CMP_GE_I32 : VOPC_I32 , "v_cmp_ge_i32", COND_SGE>; -defm V_CMP_T_I32 : VOPC_I32 , "v_cmp_t_i32">; - - -defm V_CMPX_F_I32 : VOPCX_I32 , "v_cmpx_f_i32">; -defm V_CMPX_LT_I32 : VOPCX_I32 , "v_cmpx_lt_i32", "v_cmpx_gt_i32">; -defm V_CMPX_EQ_I32 : VOPCX_I32 , "v_cmpx_eq_i32">; -defm V_CMPX_LE_I32 : VOPCX_I32 , "v_cmpx_le_i32", "v_cmpx_ge_i32">; -defm V_CMPX_GT_I32 : VOPCX_I32 , "v_cmpx_gt_i32">; -defm V_CMPX_NE_I32 : VOPCX_I32 , "v_cmpx_ne_i32">; -defm V_CMPX_GE_I32 : VOPCX_I32 , "v_cmpx_ge_i32">; -defm V_CMPX_T_I32 : VOPCX_I32 , "v_cmpx_t_i32">; - - -defm V_CMP_F_I64 : VOPC_I64 , "v_cmp_f_i64">; -defm V_CMP_LT_I64 : VOPC_I64 , "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; -defm V_CMP_EQ_I64 : VOPC_I64 , "v_cmp_eq_i64", COND_EQ>; -defm V_CMP_LE_I64 : VOPC_I64 , "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; -defm V_CMP_GT_I64 : VOPC_I64 , "v_cmp_gt_i64", COND_SGT>; -defm V_CMP_NE_I64 : VOPC_I64 , "v_cmp_ne_i64", COND_NE>; -defm V_CMP_GE_I64 : VOPC_I64 , "v_cmp_ge_i64", COND_SGE>; -defm V_CMP_T_I64 : VOPC_I64 , "v_cmp_t_i64">; - - -defm V_CMPX_F_I64 : VOPCX_I64 , "v_cmpx_f_i64">; -defm V_CMPX_LT_I64 : VOPCX_I64 , "v_cmpx_lt_i64", "v_cmpx_gt_i64">; -defm V_CMPX_EQ_I64 : VOPCX_I64 , "v_cmpx_eq_i64">; -defm V_CMPX_LE_I64 : VOPCX_I64 , "v_cmpx_le_i64", "v_cmpx_ge_i64">; -defm V_CMPX_GT_I64 : 
VOPCX_I64 , "v_cmpx_gt_i64">; -defm V_CMPX_NE_I64 : VOPCX_I64 , "v_cmpx_ne_i64">; -defm V_CMPX_GE_I64 : VOPCX_I64 , "v_cmpx_ge_i64">; -defm V_CMPX_T_I64 : VOPCX_I64 , "v_cmpx_t_i64">; - - -defm V_CMP_F_U32 : VOPC_I32 , "v_cmp_f_u32">; -defm V_CMP_LT_U32 : VOPC_I32 , "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; -defm V_CMP_EQ_U32 : VOPC_I32 , "v_cmp_eq_u32", COND_EQ>; -defm V_CMP_LE_U32 : VOPC_I32 , "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; -defm V_CMP_GT_U32 : VOPC_I32 , "v_cmp_gt_u32", COND_UGT>; -defm V_CMP_NE_U32 : VOPC_I32 , "v_cmp_ne_u32", COND_NE>; -defm V_CMP_GE_U32 : VOPC_I32 , "v_cmp_ge_u32", COND_UGE>; -defm V_CMP_T_U32 : VOPC_I32 , "v_cmp_t_u32">; - - -defm V_CMPX_F_U32 : VOPCX_I32 , "v_cmpx_f_u32">; -defm V_CMPX_LT_U32 : VOPCX_I32 , "v_cmpx_lt_u32", "v_cmpx_gt_u32">; -defm V_CMPX_EQ_U32 : VOPCX_I32 , "v_cmpx_eq_u32">; -defm V_CMPX_LE_U32 : VOPCX_I32 , "v_cmpx_le_u32", "v_cmpx_le_u32">; -defm V_CMPX_GT_U32 : VOPCX_I32 , "v_cmpx_gt_u32">; -defm V_CMPX_NE_U32 : VOPCX_I32 , "v_cmpx_ne_u32">; -defm V_CMPX_GE_U32 : VOPCX_I32 , "v_cmpx_ge_u32">; -defm V_CMPX_T_U32 : VOPCX_I32 , "v_cmpx_t_u32">; - - -defm V_CMP_F_U64 : VOPC_I64 , "v_cmp_f_u64">; -defm V_CMP_LT_U64 : VOPC_I64 , "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; -defm V_CMP_EQ_U64 : VOPC_I64 , "v_cmp_eq_u64", COND_EQ>; -defm V_CMP_LE_U64 : VOPC_I64 , "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; -defm V_CMP_GT_U64 : VOPC_I64 , "v_cmp_gt_u64", COND_UGT>; -defm V_CMP_NE_U64 : VOPC_I64 , "v_cmp_ne_u64", COND_NE>; -defm V_CMP_GE_U64 : VOPC_I64 , "v_cmp_ge_u64", COND_UGE>; -defm V_CMP_T_U64 : VOPC_I64 , "v_cmp_t_u64">; - -defm V_CMPX_F_U64 : VOPCX_I64 , "v_cmpx_f_u64">; -defm V_CMPX_LT_U64 : VOPCX_I64 , "v_cmpx_lt_u64", "v_cmpx_gt_u64">; -defm V_CMPX_EQ_U64 : VOPCX_I64 , "v_cmpx_eq_u64">; -defm V_CMPX_LE_U64 : VOPCX_I64 , "v_cmpx_le_u64", "v_cmpx_ge_u64">; -defm V_CMPX_GT_U64 : VOPCX_I64 , "v_cmpx_gt_u64">; -defm V_CMPX_NE_U64 : VOPCX_I64 , "v_cmpx_ne_u64">; -defm V_CMPX_GE_U64 : VOPCX_I64 , "v_cmpx_ge_u64">; -defm V_CMPX_T_U64 : VOPCX_I64 , "v_cmpx_t_u64">; - -} // End isCompare = 1, isCommutable = 1 - -defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 , "v_cmp_class_f32">; -defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 , "v_cmpx_class_f32">; -defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 , "v_cmp_class_f64">; -defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 , "v_cmpx_class_f64">; - -//===----------------------------------------------------------------------===// -// DS Instructions -//===----------------------------------------------------------------------===// - -defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; -defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; -defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; -defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; -defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; -defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; -defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; -defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; -defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; -defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; -defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; -defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; -defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; -let mayLoad = 0 in { -defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; -defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : 
DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; -} -defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; -defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; -defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; - -defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; -defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; -defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; -defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; -defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; -let mayLoad = 0 in { -defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; -defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; -} -defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; -defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; -defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; -defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; -defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; -defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; -defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; -defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; -defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; -defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; -defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; -defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; -defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; -defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < - 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < - 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; -defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI -defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; -let mayStore = 0 in { -defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; -defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; -defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; -defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; -defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; -defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; -} -defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; -defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; -defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; -defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; -defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; -defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; -defm DS_INC_U64 : 
DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; -defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; -defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; -defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; -defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; -defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; -defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; -defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; -defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; -let mayLoad = 0 in { -defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; -} -defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; -defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; -defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; -defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; - -defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; -defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; -defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; -defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; -defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; -defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; -defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; -defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; -defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; -defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; -defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; -defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; -defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; -defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; -defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; -defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; -defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; -defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; -defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; - -let mayStore = 0 in { -defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; -defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; -} - -defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; -defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; -defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; -defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; -defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; -defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; -defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; -defm 
DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; -defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; -defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; -defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; -defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; -defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; - -defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; -defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; - -defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; -defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; -defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; -defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; -defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; -defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; -defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; -defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; -defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; -defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; -defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; -defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; -defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; - -defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; -defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; - -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - -//===----------------------------------------------------------------------===// -// MUBUF Instructions -//===----------------------------------------------------------------------===// - -defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < - mubuf<0x00>, "buffer_load_format_x", VGPR_32 ->; -defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < - mubuf<0x01>, "buffer_load_format_xy", VReg_64 ->; -defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < - mubuf<0x02>, "buffer_load_format_xyz", VReg_96 ->; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < - mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 ->; -defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < - mubuf<0x04>, "buffer_store_format_x", VGPR_32 ->; -defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < - mubuf<0x05>, "buffer_store_format_xy", VReg_64 ->; -defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < - mubuf<0x06>, "buffer_store_format_xyz", VReg_96 ->; -defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < - mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 ->; -defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global ->; -defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global ->; -defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global ->; -defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global ->; -defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load ->; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load ->; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load ->; - -defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < - mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global ->; - -defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < - mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, 
truncstorei16_global ->; - -defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < - mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store ->; - -defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store ->; - -defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store ->; - -defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < - mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global ->; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; -defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < - mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global ->; -defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < - mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global ->; -//def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < - mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global ->; -defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < - mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global ->; -defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < - mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global ->; -defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < - mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global ->; -defm BUFFER_ATOMIC_AND : MUBUF_Atomic < - mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global ->; -defm BUFFER_ATOMIC_OR : MUBUF_Atomic < - mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global ->; -defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < - mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global ->; -//def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ , "buffer_atomic_fcmpswap", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 , "buffer_atomic_smin_x2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 , "buffer_atomic_umin_x2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 , "buffer_atomic_smax_x2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 , "buffer_atomic_umax_x2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 , "buffer_atomic_and_x2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 , "buffer_atomic_or_x2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 , "buffer_atomic_xor_x2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 , "buffer_atomic_inc_x2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 , "buffer_atomic_dec_x2", []>; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , "buffer_atomic_fmin_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 , "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 , 
"buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 , "buffer_wbinvl1", []>; - -//===----------------------------------------------------------------------===// -// MTBUF Instructions -//===----------------------------------------------------------------------===// - -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; - -//===----------------------------------------------------------------------===// -// MIMG Instructions -//===----------------------------------------------------------------------===// - -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; -//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; -defm 
IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; -defm 
IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; -defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m , (outs), (ins), "v_nop", [], "v_nop">; -} - -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { -defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; -} // End isMoveImm = 1 - -let Uses = [EXEC] in { - -// FIXME: Specify SchedRW for READFIRSTLANE_B32 - -def V_READFIRSTLANE_B32 : VOP1 < - 0x00000002, - (outs SReg_32:$vdst), - (ins VGPR_32:$src0), - "v_readfirstlane_b32 $vdst, $src0", - [] ->; - -} - -let SchedRW = [WriteQuarterRate32] in { - -defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", - VOP_I32_F64, fp_to_sint ->; -defm V_CVT_F64_I32 : VOP1Inst , "v_cvt_f64_i32", - VOP_F64_I32, sint_to_fp ->; -defm V_CVT_F32_I32 : VOP1Inst , "v_cvt_f32_i32", - VOP_F32_I32, sint_to_fp ->; -defm V_CVT_F32_U32 : VOP1Inst , "v_cvt_f32_u32", - VOP_F32_I32, uint_to_fp ->; -defm V_CVT_U32_F32 : VOP1Inst , "v_cvt_u32_f32", - VOP_I32_F32, fp_to_uint ->; -defm V_CVT_I32_F32 : VOP1Inst , "v_cvt_i32_f32", - VOP_I32_F32, fp_to_sint ->; -defm V_CVT_F16_F32 : VOP1Inst , "v_cvt_f16_f32", - VOP_I32_F32, fp_to_f16 ->; -defm V_CVT_F32_F16 : VOP1Inst , "v_cvt_f32_f16", - VOP_F32_I32, f16_to_fp ->; -defm V_CVT_RPI_I32_F32 : VOP1Inst , "v_cvt_rpi_i32_f32", - VOP_I32_F32, cvt_rpi_i32_f32>; -defm V_CVT_FLR_I32_F32 : VOP1Inst , "v_cvt_flr_i32_f32", - VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; -defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", - VOP_F32_F64, fround ->; -defm V_CVT_F64_F32 : VOP1Inst , "v_cvt_f64_f32", - VOP_F64_F32, fextend ->; -defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0", - VOP_F32_I32, AMDGPUcvt_f32_ubyte0 ->; -defm V_CVT_F32_UBYTE1 : VOP1Inst , "v_cvt_f32_ubyte1", - VOP_F32_I32, AMDGPUcvt_f32_ubyte1 ->; -defm V_CVT_F32_UBYTE2 : VOP1Inst , "v_cvt_f32_ubyte2", - VOP_F32_I32, AMDGPUcvt_f32_ubyte2 ->; -defm V_CVT_F32_UBYTE3 : VOP1Inst , "v_cvt_f32_ubyte3", - VOP_F32_I32, AMDGPUcvt_f32_ubyte3 ->; -defm V_CVT_U32_F64 : VOP1Inst , 
"v_cvt_u32_f64", - VOP_I32_F64, fp_to_uint ->; -defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32", - VOP_F64_I32, uint_to_fp ->; - -} // let SchedRW = [WriteQuarterRate32] - -defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", - VOP_F32_F32, AMDGPUfract ->; -defm V_TRUNC_F32 : VOP1Inst , "v_trunc_f32", - VOP_F32_F32, ftrunc ->; -defm V_CEIL_F32 : VOP1Inst , "v_ceil_f32", - VOP_F32_F32, fceil ->; -defm V_RNDNE_F32 : VOP1Inst , "v_rndne_f32", - VOP_F32_F32, frint ->; -defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32", - VOP_F32_F32, ffloor ->; -defm V_EXP_F32 : VOP1Inst , "v_exp_f32", - VOP_F32_F32, fexp2 ->; - -let SchedRW = [WriteQuarterRate32] in { - -defm V_LOG_F32 : VOP1Inst , "v_log_f32", - VOP_F32_F32, flog2 ->; -defm V_RCP_F32 : VOP1Inst , "v_rcp_f32", - VOP_F32_F32, AMDGPUrcp ->; -defm V_RCP_IFLAG_F32 : VOP1Inst , "v_rcp_iflag_f32", - VOP_F32_F32 ->; -defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", - VOP_F32_F32, AMDGPUrsq ->; - -} //let SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { - -defm V_RCP_F64 : VOP1Inst , "v_rcp_f64", - VOP_F64_F64, AMDGPUrcp ->; -defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", - VOP_F64_F64, AMDGPUrsq ->; - -} // let SchedRW = [WriteDouble]; - -defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", - VOP_F32_F32, fsqrt ->; - -let SchedRW = [WriteDouble] in { - -defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64", - VOP_F64_F64, fsqrt ->; - -} // let SchedRW = [WriteDouble] - -defm V_SIN_F32 : VOP1Inst , "v_sin_f32", - VOP_F32_F32, AMDGPUsin ->; -defm V_COS_F32 : VOP1Inst , "v_cos_f32", - VOP_F32_F32, AMDGPUcos ->; -defm V_NOT_B32 : VOP1Inst , "v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst , "v_bfrev_b32", VOP_I32_I32>; -defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>; -defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>; -defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>; -defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64", - VOP_I32_F64 ->; -defm V_FREXP_MANT_F64 : VOP1Inst , "v_frexp_mant_f64", - VOP_F64_F64 ->; -defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", VOP_F64_F64>; -defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32", - VOP_I32_F32 ->; -defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32", - VOP_F32_F32 ->; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m , (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; -} -defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_I32_I32>; -defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_I32>; -defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>; - -// These instruction only exist on SI and CI -let SubtargetPredicate = isSICI in { - -let SchedRW = [WriteQuarterRate32] in { - -defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>; -defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32>; -defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>; -defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>; -defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32", - VOP_F32_F32, AMDGPUrsq_clamped ->; -defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32", - VOP_F32_F32, AMDGPUrsq_legacy ->; - -} // End let SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { - -defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>; -defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", - VOP_F64_F64, AMDGPUrsq_clamped ->; - -} // End SchedRW = [WriteDouble] - -} // End SubtargetPredicate = isSICI - -//===----------------------------------------------------------------------===// -// VINTRP Instructions 
-//===----------------------------------------------------------------------===// - -let Uses = [M0] in { - -// FIXME: Specify SchedRW for VINTRP insturctions. - -multiclass V_INTERP_P1_F32_m : VINTRP_m < - 0x00000000, - (outs VGPR_32:$dst), - (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), - "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), - (i32 imm:$attr)))] ->; - -let OtherPredicates = [has32BankLDS] in { - -defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; - -} // End OtherPredicates = [has32BankLDS] - -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { - -defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; - -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" - -let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { - -defm V_INTERP_P2_F32 : VINTRP_m < - 0x00000001, - (outs VGPR_32:$dst), - (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), - "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; - -} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" - -defm V_INTERP_MOV_F32 : VINTRP_m < - 0x00000002, - (outs VGPR_32:$dst), - (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), - "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; - -} // End Uses = [M0] - -//===----------------------------------------------------------------------===// -// VOP2 Instructions -//===----------------------------------------------------------------------===// - -multiclass V_CNDMASK { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; - - defm _e64 : VOP3_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, - name#!cast(VOP_CNDMASK.Asm64), [], name, 3>; -} - -defm V_CNDMASK_B32 : V_CNDMASK, "v_cndmask_b32">; - -let isCommutable = 1 in { -defm V_ADD_F32 : VOP2Inst , "v_add_f32", - VOP_F32_F32_F32, fadd ->; - -defm V_SUB_F32 : VOP2Inst , "v_sub_f32", VOP_F32_F32_F32, fsub>; -defm V_SUBREV_F32 : VOP2Inst , "v_subrev_f32", - VOP_F32_F32_F32, null_frag, "v_sub_f32" ->; -} // End isCommutable = 1 - -let isCommutable = 1 in { - -defm V_MUL_LEGACY_F32 : VOP2Inst , "v_mul_legacy_f32", - VOP_F32_F32_F32, int_AMDGPU_mul ->; - -defm V_MUL_F32 : VOP2Inst , "v_mul_f32", - VOP_F32_F32_F32, fmul ->; - -defm V_MUL_I32_I24 : VOP2Inst , "v_mul_i32_i24", - VOP_I32_I32_I32, AMDGPUmul_i24 ->; - -defm V_MUL_HI_I32_I24 : VOP2Inst , "v_mul_hi_i32_i24", - VOP_I32_I32_I32 ->; - -defm V_MUL_U32_U24 : VOP2Inst , "v_mul_u32_u24", - VOP_I32_I32_I32, AMDGPUmul_u24 ->; - -defm V_MUL_HI_U32_U24 : VOP2Inst , "v_mul_hi_u32_u24", - VOP_I32_I32_I32 ->; - -defm V_MIN_F32 : VOP2Inst , "v_min_f32", VOP_F32_F32_F32, - fminnum>; -defm V_MAX_F32 : VOP2Inst , "v_max_f32", VOP_F32_F32_F32, - fmaxnum>; -defm V_MIN_I32 : VOP2Inst , "v_min_i32", VOP_I32_I32_I32>; -defm V_MAX_I32 : VOP2Inst , "v_max_i32", VOP_I32_I32_I32>; -defm V_MIN_U32 : VOP2Inst , "v_min_u32", VOP_I32_I32_I32>; -defm V_MAX_U32 : VOP2Inst , "v_max_u32", VOP_I32_I32_I32>; - -defm V_LSHRREV_B32 : VOP2Inst < - vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshr_b32" ->; - -defm V_ASHRREV_I32 : VOP2Inst < - vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, - "v_ashr_i32" ->; - -defm V_LSHLREV_B32 : VOP2Inst < - vop2<0x1a, 0x12>, "v_lshlrev_b32", 
VOP_I32_I32_I32, null_frag, - "v_lshl_b32" ->; - -defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; -defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; -defm V_XOR_B32 : VOP2Inst , "v_xor_b32", VOP_I32_I32_I32>; - -defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_F32_F32_F32>; -} // End isCommutable = 1 - -defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32">; - -let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2MADK , "v_madak_f32">; -} // End isCommutable = 1 - -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC -// No patterns so that the scalar instructions are always selected. -// The scalar versions will be replaced with vector when needed later. - -// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, -// but the VI instructions behave the same as the SI versions. -defm V_ADD_I32 : VOP2bInst , "v_add_i32", - VOP_I32_I32_I32, add ->; -defm V_SUB_I32 : VOP2bInst , "v_sub_i32", VOP_I32_I32_I32>; - -defm V_SUBREV_I32 : VOP2bInst , "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" ->; - -let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2bInst , "v_addc_u32", - VOP_I32_I32_I32_VCC ->; -defm V_SUBB_U32 : VOP2bInst , "v_subb_u32", - VOP_I32_I32_I32_VCC ->; -defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" ->; - -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] - -defm V_READLANE_B32 : VOP2SI_3VI_m < - vop3 <0x001, 0x289>, - "v_readlane_b32", - (outs SReg_32:$vdst), - (ins VGPR_32:$src0, SCSrc_32:$src1), - "v_readlane_b32 $vdst, $src0, $src1" ->; - -defm V_WRITELANE_B32 : VOP2SI_3VI_m < - vop3 <0x002, 0x28a>, - "v_writelane_b32", - (outs VGPR_32:$vdst), - (ins SReg_32:$src0, SCSrc_32:$src1), - "v_writelane_b32 $vdst, $src0, $src1" ->; - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -defm V_MIN_LEGACY_F32 : VOP2InstSI , "v_min_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmin_legacy ->; -defm V_MAX_LEGACY_F32 : VOP2InstSI , "v_max_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmax_legacy ->; - -let isCommutable = 1 in { -defm V_LSHR_B32 : VOP2InstSI , "v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2InstSI , "v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2InstSI , "v_lshl_b32", VOP_I32_I32_I32>; -} // End isCommutable = 1 -} // End let SubtargetPredicate = SICI - -let isCommutable = 1 in { -defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst , "v_mac_legacy_f32", - VOP_F32_F32_F32 ->; -} // End isCommutable = 1 - -defm V_BFM_B32 : VOP2_VI3_Inst , "v_bfm_b32", - VOP_I32_I32_I32 ->; -defm V_BCNT_U32_B32 : VOP2_VI3_Inst , "v_bcnt_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 ->; -defm V_LDEXP_F32 : VOP2_VI3_Inst , "v_ldexp_f32", - VOP_F32_F32_I32, AMDGPUldexp ->; - -defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst , "v_cvt_pkaccum_u8_f32", - VOP_I32_F32_I32>; // TODO: set "Uses = dst" - -defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_i16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_u16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst , "v_cvt_pkrtz_f16_f32", - VOP_I32_F32_F32, int_SI_packf16 ->; -defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst , "v_cvt_pk_u16_u32", - VOP_I32_I32_I32 ->; -defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst , "v_cvt_pk_i16_i32", - VOP_I32_I32_I32 ->; - 
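// A minimal usage sketch (illustrative only; standard SI assembly syntax is
// assumed, this is not taken from the file above): the VOP2b carry chain
// splits a 64-bit add into a low half whose carry-out goes to VCC and a high
// half that consumes that carry:
//
//   v_add_i32  v0, vcc, v2, v4        ; low 32 bits, carry-out -> VCC
//   v_addc_u32 v1, vcc, v3, v5, vcc   ; high 32 bits, carry-in from VCC
//
// On VI the low half is written v_add_u32, matching the *_U32 rename noted
// in the comment on the definitions above.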
-//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { -defm V_MAD_LEGACY_F32 : VOP3Inst , "v_mad_legacy_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_MAD_F32 : VOP3Inst , "v_mad_f32", - VOP_F32_F32_F32_F32, fmad ->; - -defm V_MAD_I32_I24 : VOP3Inst , "v_mad_i32_i24", - VOP_I32_I32_I32_I32, AMDGPUmad_i24 ->; -defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", - VOP_I32_I32_I32_I32, AMDGPUmad_u24 ->; -} // End isCommutable = 1 - -defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", - VOP_I32_I32_I32_I32, AMDGPUbfe_u32 ->; -defm V_BFE_I32 : VOP3Inst , "v_bfe_i32", - VOP_I32_I32_I32_I32, AMDGPUbfe_i32 ->; - -defm V_BFI_B32 : VOP3Inst , "v_bfi_b32", - VOP_I32_I32_I32_I32, AMDGPUbfi ->; - -let isCommutable = 1 in { -defm V_FMA_F32 : VOP3Inst , "v_fma_f32", - VOP_F32_F32_F32_F32, fma ->; -defm V_FMA_F64 : VOP3Inst , "v_fma_f64", - VOP_F64_F64_F64_F64, fma ->; -} // End isCommutable = 1 - -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; -defm V_ALIGNBIT_B32 : VOP3Inst , "v_alignbit_b32", - VOP_I32_I32_I32_I32 ->; -defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", - VOP_I32_I32_I32_I32 ->; - -defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmin3>; - -defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmin3 ->; -defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", - VOP_I32_I32_I32_I32, AMDGPUumin3 ->; -defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmax3 ->; -defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmax3 ->; -defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", - VOP_I32_I32_I32_I32, AMDGPUumax3 ->; -defm V_MED3_F32 : VOP3Inst , "v_med3_f32", - VOP_F32_F32_F32_F32 ->; -defm V_MED3_I32 : VOP3Inst , "v_med3_i32", - VOP_I32_I32_I32_I32 ->; -defm V_MED3_U32 : VOP3Inst , "v_med3_u32", - VOP_I32_I32_I32_I32 ->; - -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; -defm V_SAD_U32 : VOP3Inst , "v_sad_u32", - VOP_I32_I32_I32_I32 ->; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; -defm V_DIV_FIXUP_F32 : VOP3Inst < - vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup ->; - -let SchedRW = [WriteDouble] in { - -defm V_DIV_FIXUP_F64 : VOP3Inst < - vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup ->; - -} // let SchedRW = [WriteDouble] - -let SchedRW = [WriteDouble] in { -let isCommutable = 1 in { - -defm V_ADD_F64 : VOP3Inst , "v_add_f64", - VOP_F64_F64_F64, fadd ->; -defm V_MUL_F64 : VOP3Inst , "v_mul_f64", - VOP_F64_F64_F64, fmul ->; - -defm V_MIN_F64 : VOP3Inst , "v_min_f64", - VOP_F64_F64_F64, fminnum ->; -defm V_MAX_F64 : VOP3Inst , "v_max_f64", - VOP_F64_F64_F64, fmaxnum ->; - -} // isCommutable = 1 - -defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", - VOP_F64_F64_I32, AMDGPUldexp ->; - -} // let SchedRW = [WriteDouble] - -let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { - -defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_U32 : VOP3Inst , 
"v_mul_hi_u32", - VOP_I32_I32_I32 ->; - -defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", - VOP_I32_I32_I32 ->; - -} // isCommutable = 1, SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 , "v_div_scale_f32", []>; -} - -let SchedRW = [WriteDouble, WriteSALU] in { -// Double precision division pre-scale. -defm V_DIV_SCALE_F64 : VOP3b_64 , "v_div_scale_f64", []>; -} // let SchedRW = [WriteDouble] - -let isCommutable = 1, Uses = [VCC] in { - -// v_div_fmas_f32: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^32 -// -defm V_DIV_FMAS_F32 : VOP3_VCC_Inst , "v_div_fmas_f32", - VOP_F32_F32_F32_F32, AMDGPUdiv_fmas ->; - -let SchedRW = [WriteDouble] in { -// v_div_fmas_f64: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^64 -// -defm V_DIV_FMAS_F64 : VOP3_VCC_Inst , "v_div_fmas_f64", - VOP_F64_F64_F64_F64, AMDGPUdiv_fmas ->; - -} // End SchedRW = [WriteDouble] -} // End isCommutable = 1 - -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; - -let SchedRW = [WriteDouble] in { -defm V_TRIG_PREOP_F64 : VOP3Inst < - vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop ->; - -} // let SchedRW = [WriteDouble] - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32>; -defm V_LSHR_B64 : VOP3Inst , "v_lshr_b64", VOP_I64_I64_I32>; -defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32>; - -defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", - VOP_F32_F32_F32_F32>; - -} // End SubtargetPredicate = isSICI - -let SubtargetPredicate = isVI in { - -defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", - VOP_I64_I32_I64 ->; -defm V_LSHRREV_B64 : VOP3Inst , "v_lshrrev_b64", - VOP_I64_I32_I64 ->; -defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", - VOP_I64_I32_I64 ->; - -} // End SubtargetPredicate = isVI - -//===----------------------------------------------------------------------===// -// Pseudo Instructions -//===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, isPseudo = 1 in { - -// For use in patterns -def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] ->; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { -// 64-bit vector move instruction. This is mainly used by the SIFoldOperands -// pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; -} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 - -let hasSideEffects = 1 in { -def SGPR_USE : InstSI <(outs),(ins), "", []>; -} - -// SI pseudo instructions. These are used by the CFG structurizer pass -// and should be lowered to ISA instructions prior to codegen. 
- -let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { -let Uses = [EXEC], Defs = [EXEC] in { - -let isBranch = 1, isTerminator = 1 in { - -def SI_IF: InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, brtarget:$target), - "", - [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] ->; - -def SI_ELSE : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src, brtarget:$target), - "", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] -> { - let Constraints = "$src = $dst"; -} - -def SI_LOOP : InstSI < - (outs), - (ins SReg_64:$saved, brtarget:$target), - "si_loop $saved, $target", - [(int_SI_loop i64:$saved, bb:$target)] ->; - -} // end isBranch = 1, isTerminator = 1 - -def SI_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src), - "si_else $dst, $src", - [(set i64:$dst, (int_SI_break i64:$src))] ->; - -def SI_IF_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, SReg_64:$src), - "si_if_break $dst, $vcc, $src", - [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] ->; - -def SI_ELSE_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src0, SReg_64:$src1), - "si_else_break $dst, $src0, $src1", - [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] ->; - -def SI_END_CF : InstSI < - (outs), - (ins SReg_64:$saved), - "si_end_cf $saved", - [(int_SI_end_cf i64:$saved)] ->; - -} // End Uses = [EXEC], Defs = [EXEC] - -let Uses = [EXEC], Defs = [EXEC,VCC] in { -def SI_KILL : InstSI < - (outs), - (ins VSrc_32:$src), - "si_kill $src", - [(int_AMDGPU_kill f32:$src)] ->; -} // End Uses = [EXEC], Defs = [EXEC,VCC] - -} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 - -let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { - -//defm SI_ : RegisterLoadStore ; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), - "si_indirect_src $dst, $temp, $src, $idx, $off", - [] ->; - -class SI_INDIRECT_DST : InstSI < - (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), - "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", - [] -> { - let Constraints = "$src = $dst"; -} - -def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; - -} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] - -multiclass SI_SPILL_SGPR { - - let UseNamedOperandTable = 1 in { - def _SAVE : InstSI < - (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - >; - - def _RESTORE : InstSI < - (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - >; - } // End UseNamedOperandTable = 1 -} - -// It's unclear whether you can use M0 as the output of v_readlane_b32 -// instructions, so use SGPR_32 register class for spills to prevent -// this from happening. 
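// Each defm below expands the SI_SPILL_SGPR multiclass into a _SAVE/_RESTORE
// pair per spill width, e.g. SI_SPILL_S64_SAVE and SI_SPILL_S64_RESTORE for
// a 64-bit SGPR pair.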
-defm SI_SPILL_S32 : SI_SPILL_SGPR ; -defm SI_SPILL_S64 : SI_SPILL_SGPR ; -defm SI_SPILL_S128 : SI_SPILL_SGPR ; -defm SI_SPILL_S256 : SI_SPILL_SGPR ; -defm SI_SPILL_S512 : SI_SPILL_SGPR ; - -multiclass SI_SPILL_VGPR { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { - def _SAVE : InstSI < - (outs), - (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - >; - - def _RESTORE : InstSI < - (outs vgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - >; - } // End UseNamedOperandTable = 1, VGPRSpill = 1 -} - -defm SI_SPILL_V32 : SI_SPILL_VGPR ; -defm SI_SPILL_V64 : SI_SPILL_VGPR ; -defm SI_SPILL_V96 : SI_SPILL_VGPR ; -defm SI_SPILL_V128 : SI_SPILL_VGPR ; -defm SI_SPILL_V256 : SI_SPILL_VGPR ; -defm SI_SPILL_V512 : SI_SPILL_VGPR ; - -let Defs = [SCC] in { - -def SI_CONSTDATA_PTR : InstSI < - (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; - -} // End Defs = [SCC] - -} // end IsCodeGenOnly, isPseudo - -} // end SubtargetPredicate = isGCN - -let Predicates = [isGCN] in { - -def : Pat< - (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, - (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, - DSTCLAMP.NONE, DSTOMOD.NONE)) ->; - -def : Pat < - (int_AMDGPU_kilp), - (SI_KILL 0xbf800000) ->; - -/* int_SI_vs_load_input */ -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) ->; - -/* int_SI_export */ -def : Pat < - (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, - f32:$src0, f32:$src1, f32:$src2, f32:$src3), - (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, - $src0, $src1, $src2, $src3) ->; - -//===----------------------------------------------------------------------===// -// SMRD Patterns -//===----------------------------------------------------------------------===// - -multiclass SMRD_Pattern { - - // 1. SI-CI: Offset as 8bit DWORD immediate - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -multiclass SMRD_Pattern_vi { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -let Predicates = [isSICI] in { -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -} // End Predicates = [isSICI] - -let Predicates = [isVI] in { -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -} // End Predicates = [isVI] - -let Predicates = [isSICI] in { - -// 1. 
Offset as 8bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) ->; - -} // End Predicates = [isSICI] - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant v4i32:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) ->; - -//===----------------------------------------------------------------------===// -// SOP1 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i64 (ctpop i64:$src)), - (i64 (REG_SEQUENCE SReg_64, - (S_BCNT1_I32_B64 $src), sub0, - (S_MOV_B32 0), sub1)) ->; - -//===----------------------------------------------------------------------===// -// SOP2 Patterns -//===----------------------------------------------------------------------===// - -// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector -// case, the sgpr-copies pass will fix this to use the vector version. -def : Pat < - (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_U32 $src0, $src1) ->; - -//===----------------------------------------------------------------------===// -// SOPP Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (int_AMDGPU_barrier_global), - (S_BARRIER) ->; - -//===----------------------------------------------------------------------===// -// VOP1 Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [UnsafeFPMath] in { - -//def : RcpPat; -//defm : RsqPat; -//defm : RsqPat; - -def : RsqPat; -def : RsqPat; -} - -//===----------------------------------------------------------------------===// -// VOP2 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e64 $popcnt, $val) ->; - -def : Pat < - (i32 (select i1:$src0, i32:$src1, i32:$src2)), - (V_CNDMASK_B32_e64 $src2, $src1, $src0) ->; - -/********** ======================= **********/ -/********** Image sampling patterns **********/ -/********** ======================= **********/ - -// Image + sampler -class SampleRawPattern : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc, $sampler) ->; - -multiclass SampleRawPatterns { - def : SampleRawPattern(opcode # _V4_V1), i32>; - def : SampleRawPattern(opcode # _V4_V2), v2i32>; - def : SampleRawPattern(opcode # _V4_V4), v4i32>; - def : SampleRawPattern(opcode # _V4_V8), v8i32>; - def : SampleRawPattern(opcode # _V4_V16), v16i32>; -} - -// Image only -class ImagePattern : Pat < - (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc) ->; - -multiclass ImagePatterns { - def : ImagePattern(opcode # _V4_V1), i32>; - def : ImagePattern(opcode # _V4_V2), v2i32>; - def : ImagePattern(opcode # _V4_V4), v4i32>; -} - -// Basic sample -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; 
-defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with comparison -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with offsets -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with comparison and offsets -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Gather opcodes -// Only the variants which make sense are defined. -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : ImagePattern; -defm : ImagePatterns; -defm : ImagePatterns; - -/* SIsample for simple 1D texture lookup */ -def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SamplePattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleRectPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleShadowPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleShadowArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -/* SIsample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns { - def : SamplePattern ; - def : SampleRectPattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : 
SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; -} - -defm : SamplePatterns; -defm : SamplePatterns; -defm : SamplePatterns; -defm : SamplePatterns; - -/* int_SI_imageload for texture fetches consuming varying address parameters */ -class ImageLoadPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -multiclass ImageLoadPatterns { - def : ImageLoadPattern ; - def : ImageLoadArrayPattern ; -} - -multiclass ImageLoadMSAAPatterns { - def : ImageLoadMSAAPattern ; - def : ImageLoadArrayMSAAPattern ; -} - -defm : ImageLoadPatterns; -defm : ImageLoadPatterns; - -defm : ImageLoadMSAAPatterns; -defm : ImageLoadMSAAPatterns; - -/* Image resource information */ -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -/********** ============================================ **********/ -/********** Extraction, Insertion, Building and Casting **********/ -/********** ============================================ **********/ - -foreach Index = 0-2 in { - def Extract_Element_v2i32_#Index : Extract_Element < - i32, v2i32, Index, !cast(sub#Index) - >; - def Insert_Element_v2i32_#Index : Insert_Element < - i32, v2i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v2f32_#Index : Extract_Element < - f32, v2f32, Index, !cast(sub#Index) - >; - def Insert_Element_v2f32_#Index : Insert_Element < - f32, v2f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-3 in { - def Extract_Element_v4i32_#Index : Extract_Element < - i32, v4i32, Index, !cast(sub#Index) - >; - def Insert_Element_v4i32_#Index : Insert_Element < - i32, v4i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v4f32_#Index : Extract_Element < - f32, v4f32, Index, !cast(sub#Index) - >; - def Insert_Element_v4f32_#Index : Insert_Element < - f32, v4f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-7 in { - def Extract_Element_v8i32_#Index : Extract_Element < - i32, v8i32, Index, !cast(sub#Index) - >; - def Insert_Element_v8i32_#Index : Insert_Element < - i32, v8i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v8f32_#Index : Extract_Element < - f32, v8f32, Index, !cast(sub#Index) - >; - def Insert_Element_v8f32_#Index : Insert_Element < - f32, v8f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-15 in { - def Extract_Element_v16i32_#Index : Extract_Element < - i32, v16i32, Index, !cast(sub#Index) - >; - def Insert_Element_v16i32_#Index : Insert_Element < - i32, v16i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v16f32_#Index : Extract_Element < - f32, v16f32, Index, 
!cast(sub#Index) - >; - def Insert_Element_v16f32_#Index : Insert_Element < - f32, v16f32, Index, !cast(sub#Index) - >; -} - -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; - -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; - -/********** =================== **********/ -/********** Src & Dst modifiers **********/ -/********** =================== **********/ - -def : Pat < - (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), - (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) ->; - -/********** ================================ **********/ -/********** Floating point absolute/negative **********/ -/********** ================================ **********/ - -// Prevent expanding both fneg and fabs. - -// FIXME: Should use S_OR_B32 -def : Pat < - (fneg (fabs f32:$src)), - (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ ->; - -// FIXME: Should use S_OR_B32 -def : Pat < - (fneg (fabs f64:$src)), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), // Set sign bit. - sub1) ->; - -def : Pat < - (fabs f32:$src), - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) ->; - -def : Pat < - (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) ->; - -def : Pat < - (fabs f64:$src), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. - sub1) ->; - -def : Pat < - (fneg f64:$src), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), - sub1) ->; - -/********** ================== **********/ -/********** Immediate Patterns **********/ -/********** ================== **********/ - -def : Pat < - (SGPRImm<(i32 imm)>:$imm), - (S_MOV_B32 imm:$imm) ->; - -def : Pat < - (SGPRImm<(f32 fpimm)>:$imm), - (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) ->; - -def : Pat < - (i32 imm:$imm), - (V_MOV_B32_e32 imm:$imm) ->; - -def : Pat < - (f32 fpimm:$imm), - (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) ->; - -def : Pat < - (i64 InlineImm:$imm), - (S_MOV_B64 InlineImm:$imm) ->; - -// XXX - Should this use a s_cmp to set SCC? 
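// Note on the immediate patterns above (sketch, constants are illustrative):
// constants that can live in an SGPR are materialized with s_mov_b32 /
// s_mov_b64, while the generic fallback uses v_mov_b32, roughly:
//   s_mov_b32 s0, 0x12345678   - uniform value
//   v_mov_b32 v0, 0x12345678   - divergent value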
- -// Set to sign-extended 64-bit value (true = -1, false = 0) -def : Pat < - (i1 imm:$imm), - (S_MOV_B64 (i64 (as_i64imm $imm))) ->; - -def : Pat < - (f64 InlineFPImm:$imm), - (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm:$imm))) ->; - -/********** ================== **********/ -/********** Intrinsic Patterns **********/ -/********** ================== **********/ - -/* llvm.AMDGPU.pow */ -def : POW_Common ; - -def : Pat < - (int_AMDGPU_div f32:$src0, f32:$src1), - (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) ->; - -def : Pat < - (int_AMDGPU_cube v4f32:$src), - (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub3) ->; - -def : Pat < - (i32 (sext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) ->; - -class Ext32Pat : Pat < - (i32 (ext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) ->; - -def : Ext32Pat ; -def : Ext32Pat ; - -// Offset in an 32Bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) ->; - -// The multiplication scales from [0,1] to the unsigned integer range -def : Pat < - (AMDGPUurecip i32:$src0), - (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, - (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) ->; - -def : Pat < - (int_SI_tid), - (V_MBCNT_HI_U32_B32_e64 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) ->; - -//===----------------------------------------------------------------------===// -// VOP3 Patterns -//===----------------------------------------------------------------------===// - -def : IMad24Pat; -def : UMad24Pat; - -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1) ->; - -defm : BFIPatterns ; -def : ROTRPattern ; - -/********** ======================= **********/ -/********** Load/Store Patterns **********/ -/********** ======================= **********/ - -class DSReadPat : Pat < - (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) ->; - -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; - -let AddedComplexity = 100 in { - -def : DSReadPat ; - -} // End AddedComplexity = 100 - -def : Pat < - (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1))), - (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) ->; - -class DSWritePat : Pat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; - -let AddedComplexity = 100 in { - -def : DSWritePat ; -} // End AddedComplexity = 100 - -def : Pat < 
- (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, - (i1 0)) ->; - -class DSAtomicRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec -// -// We need to use something for the data0, so we set a register to -// -1. For the non-rtn variants, the manual says it does -// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max -// will always do the increment so I'm assuming it's the same. -// -// We also load this -1 with s_mov_b32 / s_mov_b64 even though this -// needs to be a VGPR. The SGPR copy pass will fix this, and it's -// easier since there is no v_mov_b64. -class DSAtomicIncRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) ->; - - -class DSAtomicCmpXChg : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) ->; - - -// 32-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; - -// 64-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; - - -//===----------------------------------------------------------------------===// -// MUBUF Patterns -//===----------------------------------------------------------------------===// - -multiclass MUBUFLoad_Pattern { - def : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) - >; -} - -let Predicates = [isSICI] in { -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -} // End Predicates = [isSICI] - -class MUBUFScratchLoadPat : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword { - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword 
v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; - -class MUBUFScratchStorePat : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; - -/* -class MUBUFStore_Pattern : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $offset) ->; - -let Predicates = [isSICI] in { -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -} // End Predicates = [isSICI] - -*/ - -//===----------------------------------------------------------------------===// -// MTBUF Patterns -//===----------------------------------------------------------------------===// - -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; - -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; - -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst , "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst , "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst , "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst , "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst , "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - -/********** ====================== **********/ -/********** Indirect adressing **********/ -/********** ====================== **********/ - -multiclass SI_INDIRECT_Pattern { - - // 1. Extract with offset - def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) - >; - - // 2. Extract without offset - def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) - >; - - // 3. 
Insert with offset - def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) - >; - - // 4. Insert without offset - def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) - >; -} - -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; - -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; - -//===----------------------------------------------------------------------===// -// Conversion Patterns -//===----------------------------------------------------------------------===// - -def : Pat<(i32 (sext_inreg i32:$src, i1)), - (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 - -// Handle sext_inreg in i64 -def : Pat < - (i64 (sext_inreg i64:$src, i1)), - (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 ->; - -def : Pat < - (i64 (sext_inreg i64:$src, i8)), - (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 ->; - -def : Pat < - (i64 (sext_inreg i64:$src, i16)), - (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 ->; - -def : Pat < - (i64 (sext_inreg i64:$src, i32)), - (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 ->; - -class ZExt_i64_i32_Pat : Pat < - (i64 (ext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) ->; - -class ZExt_i64_i1_Pat : Pat < - (i64 (ext i1:$src)), - (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 0), sub1) ->; - - -def : ZExt_i64_i32_Pat; -def : ZExt_i64_i32_Pat; -def : ZExt_i64_i1_Pat; -def : ZExt_i64_i1_Pat; - -def : Pat < - (i64 (sext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, - (S_ASHR_I32 $src, 31), sub1) ->; - -def : Pat < - (i64 (sext i1:$src)), - (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 0, -1, $src), sub0, - (V_CNDMASK_B32_e64 0, -1, $src), sub1) ->; - -// If we need to perform a logical operation on i1 values, we need to -// use vector comparisons since there is only one SCC register. Vector -// comparisions still write to a pair of SGPRs, so treat these as -// 64-bit comparisons. When legalizing SGPR copies, instructions -// resulting in the copies from SCC to these instructions will be -// moved to the VALU. 
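// In other words, an IR-level 'and i1 %a, %b' selects to a 64-bit scalar
// operation on two lane masks (one bit per lane), e.g. roughly:
//   s_and_b64 s[0:1], s[2:3], s[4:5]
// and likewise for the 'or' and 'xor' patterns below.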
-def : Pat < - (i1 (and i1:$src0, i1:$src1)), - (S_AND_B64 $src0, $src1) ->; - -def : Pat < - (i1 (or i1:$src0, i1:$src1)), - (S_OR_B64 $src0, $src1) ->; - -def : Pat < - (i1 (xor i1:$src0, i1:$src1)), - (S_XOR_B64 $src0, $src1) ->; - -def : Pat < - (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) ->; - -def : Pat < - (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) ->; - -def : Pat < - (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) ->; - -def : Pat < - (f64 (uint_to_fp i1:$src)), - (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) ->; - -//===----------------------------------------------------------------------===// -// Miscellaneous Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i32 (trunc i64:$a)), - (EXTRACT_SUBREG $a, sub0) ->; - -def : Pat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) ->; - -def : Pat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), - (EXTRACT_SUBREG $a, sub0)), 1) ->; - -def : Pat < - (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 0x00ff00ff), - (V_ALIGNBIT_B32 $a, $a, 24), - (V_ALIGNBIT_B32 $a, $a, 8)) ->; - -def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) ->; - -multiclass BFMPatterns { - def : Pat < - (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), - (BFM $a, $b) - >; - - def : Pat < - (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV 0)) - >; -} - -defm : BFMPatterns ; -// FIXME: defm : BFMPatterns ; - -def : BFEPattern ; - -//===----------------------------------------------------------------------===// -// Fract Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isSI] in { - -// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is -// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient -// way to implement it is using V_FRACT_F64. -// The workaround for the V_FRACT bug is: -// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) - -// Convert (x + (-floor(x)) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_CNDMASK_B64_PSEUDO - $x, - (V_MIN_F64 - SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), - SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) ->; - -// Convert floor(x) to (x - fract(x)) -def : Pat < - (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), - (V_ADD_F64 - $mods, - $x, - SRCMODS.NEG, - (V_CNDMASK_B64_PSEUDO - $x, - (V_MIN_F64 - SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), - SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), - DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isSI] - -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - -//============================================================================// -// Miscellaneous Optimization Patterns -//============================================================================// - -def : SHA256MaPattern ; - -//============================================================================// -// Assembler aliases -//============================================================================// - -def : MnemonicAlias<"v_add_u32", "v_add_i32">; -def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; -def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; - -} // End isGCN predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td deleted file mode 100644 index 027a0a2f516..00000000000 --- a/lib/Target/R600/SIIntrinsics.td +++ /dev/null @@ -1,199 +0,0 @@ -//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// SI Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - - -let TargetPrefix = "SI", isTarget = 1 in { - - def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; - def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; - - // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed - def int_SI_tbuffer_store : Intrinsic < - [], - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW - llvm_i32_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - []>; - - // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed - def int_SI_buffer_load_dword : Intrinsic < - [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - [IntrReadArgMem]>; - - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Fully-flexible SAMPLE instruction. - class SampleRaw : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_v4i32_ty, // sampler(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Image instruction without a sampler. 
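  // (It takes the same operand list as SampleRaw above, minus the v4i32
  // sampler descriptor.)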
- class Image : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Basic sample - def int_SI_image_sample : SampleRaw; - def int_SI_image_sample_cl : SampleRaw; - def int_SI_image_sample_d : SampleRaw; - def int_SI_image_sample_d_cl : SampleRaw; - def int_SI_image_sample_l : SampleRaw; - def int_SI_image_sample_b : SampleRaw; - def int_SI_image_sample_b_cl : SampleRaw; - def int_SI_image_sample_lz : SampleRaw; - def int_SI_image_sample_cd : SampleRaw; - def int_SI_image_sample_cd_cl : SampleRaw; - - // Sample with comparison - def int_SI_image_sample_c : SampleRaw; - def int_SI_image_sample_c_cl : SampleRaw; - def int_SI_image_sample_c_d : SampleRaw; - def int_SI_image_sample_c_d_cl : SampleRaw; - def int_SI_image_sample_c_l : SampleRaw; - def int_SI_image_sample_c_b : SampleRaw; - def int_SI_image_sample_c_b_cl : SampleRaw; - def int_SI_image_sample_c_lz : SampleRaw; - def int_SI_image_sample_c_cd : SampleRaw; - def int_SI_image_sample_c_cd_cl : SampleRaw; - - // Sample with offsets - def int_SI_image_sample_o : SampleRaw; - def int_SI_image_sample_cl_o : SampleRaw; - def int_SI_image_sample_d_o : SampleRaw; - def int_SI_image_sample_d_cl_o : SampleRaw; - def int_SI_image_sample_l_o : SampleRaw; - def int_SI_image_sample_b_o : SampleRaw; - def int_SI_image_sample_b_cl_o : SampleRaw; - def int_SI_image_sample_lz_o : SampleRaw; - def int_SI_image_sample_cd_o : SampleRaw; - def int_SI_image_sample_cd_cl_o : SampleRaw; - - // Sample with comparison and offsets - def int_SI_image_sample_c_o : SampleRaw; - def int_SI_image_sample_c_cl_o : SampleRaw; - def int_SI_image_sample_c_d_o : SampleRaw; - def int_SI_image_sample_c_d_cl_o : SampleRaw; - def int_SI_image_sample_c_l_o : SampleRaw; - def int_SI_image_sample_c_b_o : SampleRaw; - def int_SI_image_sample_c_b_cl_o : SampleRaw; - def int_SI_image_sample_c_lz_o : SampleRaw; - def int_SI_image_sample_c_cd_o : SampleRaw; - def int_SI_image_sample_c_cd_cl_o : SampleRaw; - - // Basic gather4 - def int_SI_gather4 : SampleRaw; - def int_SI_gather4_cl : SampleRaw; - def int_SI_gather4_l : SampleRaw; - def int_SI_gather4_b : SampleRaw; - def int_SI_gather4_b_cl : SampleRaw; - def int_SI_gather4_lz : SampleRaw; - - // Gather4 with comparison - def int_SI_gather4_c : SampleRaw; - def int_SI_gather4_c_cl : SampleRaw; - def int_SI_gather4_c_l : SampleRaw; - def int_SI_gather4_c_b : SampleRaw; - def int_SI_gather4_c_b_cl : SampleRaw; - def int_SI_gather4_c_lz : SampleRaw; - - // Gather4 with offsets - def int_SI_gather4_o : SampleRaw; - def int_SI_gather4_cl_o : SampleRaw; - def int_SI_gather4_l_o : SampleRaw; - def int_SI_gather4_b_o : SampleRaw; - def int_SI_gather4_b_cl_o : SampleRaw; - def int_SI_gather4_lz_o : SampleRaw; - - // Gather4 with comparison and offsets - def int_SI_gather4_c_o : SampleRaw; - def int_SI_gather4_c_cl_o : SampleRaw; - def int_SI_gather4_c_l_o : SampleRaw; - def int_SI_gather4_c_b_o : SampleRaw; - def int_SI_gather4_c_b_cl_o : SampleRaw; - def int_SI_gather4_c_lz_o : SampleRaw; - - def int_SI_getlod : SampleRaw; - - // Image instrinsics. - def int_SI_image_load : Image; - def int_SI_image_load_mip : Image; - def int_SI_getresinfo : Image; - - // Deprecated image and sample intrinsics. 
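  // (These older variants take the texture resource as v32i8 rather than the
  // v8i32 used by the raw image/sample intrinsics above.)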
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - - /* Interpolation Intrinsics */ - - def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; - - /* Control flow Intrinsics */ - - def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; - def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; - def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; - def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; - def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; -} diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp deleted file mode 100644 index 9b1d256dc5a..00000000000 --- a/lib/Target/R600/SILoadStoreOptimizer.cpp +++ /dev/null @@ -1,421 +0,0 @@ -//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to fuse DS instructions with close by immediate offsets. -// This will fuse operations such as -// ds_read_b32 v0, v2 offset:16 -// ds_read_b32 v1, v2 offset:32 -// ==> -// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 -// -// -// Future improvements: -// -// - This currently relies on the scheduler to place loads and stores next to -// each other, and then only merges adjacent pairs of instructions. It would -// be good to be more flexible with interleaved instructions, and possibly run -// before scheduling. It currently missing stores of constants because loading -// the constant into the data register is placed between the stores, although -// this is arguably a scheduling problem. -// -// - Live interval recomputing seems inefficient. This currently only matches -// one pair, and recomputes live intervals and moves on to the next pair. It -// would be better to compute a list of all merges that need to occur -// -// - With a list of instructions to process, we can also merge more. If a -// cluster of loads have offsets that are too large to fit in the 8-bit -// offsets, but are close enough to fit in the 8 bits, we can add to the base -// pointer and use the new reduced offsets. 
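//
// Worked example of the existing st64 handling (numbers are illustrative,
// see offsetsCanBeCombined / mergeRead2Pair below): offsets too large for
// the plain 8-bit fields can still be merged when both element offsets are
// multiples of 64, e.g. with 4-byte elements
//   ds_read_b32 v0, v2 offset:1024
//   ds_read_b32 v1, v2 offset:2048
// ==>
//   ds_read2st64_b32 v[0:1], v2, offset0:4 offset1:8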
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-load-store-opt" - -namespace { - -class SILoadStoreOptimizer : public MachineFunctionPass { -private: - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - MachineRegisterInfo *MRI; - LiveIntervals *LIS; - - - static bool offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned EltSize); - - MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize); - - void updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx); - - MachineBasicBlock::iterator mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize); - - MachineBasicBlock::iterator mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize); - -public: - static char ID; - - SILoadStoreOptimizer() - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), - LIS(nullptr) {} - - SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - } - - bool optimizeBlock(MachineBasicBlock &MBB); - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Load / Store Optimizer"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.addRequired(); - - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) - -char SILoadStoreOptimizer::ID = 0; - -char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; - -FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { - return new SILoadStoreOptimizer(TM); -} - -bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned Size) { - // XXX - Would the same offset be OK? Is there any reason this would happen or - // be useful? - if (Offset0 == Offset1) - return false; - - // This won't be valid if the offset isn't aligned. - if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) - return false; - - unsigned EltOffset0 = Offset0 / Size; - unsigned EltOffset1 = Offset1 / Size; - - // Check if the new offsets fit in the reduced 8-bit range. - if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) - return true; - - // If the offset in elements doesn't fit in 8-bits, we might be able to use - // the stride 64 versions. 
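  // (The st64 encodings scale their 8-bit offsets by 64 elements, which is
  // why dividing by 64 below is safe when both offsets are multiples of 64.)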
- if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) - return false; - - return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); -} - -MachineBasicBlock::iterator -SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize){ - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; - ++MBBI; - - if (MBBI->getOpcode() != I->getOpcode()) - return E; - - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return E; - - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); - const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); - - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. - if (AddrReg0.getReg() == AddrReg1.getReg() && - AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), - AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; - - // Check both offsets fit in the reduced range. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) - return MBBI; - } - - return E; -} - -void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx) { - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), - E = MRI->reg_end(); I != E; ) { - MachineOperand &O = *I; - ++I; - O.substVirtReg(DstReg, SubIdx, *TRI); - } -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize) { - MachineBasicBlock *MBB = I->getParent(); - - // Be careful, since the addresses could be subregisters themselves in weird - // cases, like vectors of pointers. - const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - - unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); - unsigned DestReg1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - } - - assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); - - const MCInstrDesc &Read2Desc = TII->get(Opc); - - const TargetRegisterClass *SuperRC - = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; - unsigned DestReg = MRI->createVirtualRegister(SuperRC); - - DebugLoc DL = I->getDebugLoc(); - MachineInstrBuilder Read2 - = BuildMI(*MBB, I, DL, Read2Desc, DestReg) - .addOperand(*AddrReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); - - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; - updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); - updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); - - LIS->RemoveMachineInstrFromMaps(I); - // Replacing Paired in the maps with Read2 allows us to avoid updating the - // live range for the m0 register. - LIS->ReplaceMachineInstrInMaps(Paired, Read2); - I->eraseFromParent(); - Paired->eraseFromParent(); - - LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); - LIS->shrinkToUses(&AddrRegLI); - - LIS->getInterval(DestReg); // Create new LI - - DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Read2.getInstr(); -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize) { - MachineBasicBlock *MBB = I->getParent(); - - // Be sure to use .addOperand(), and not .addReg() with these. We want to be - // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); - - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; - } - - assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); - - const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = I->getDebugLoc(); - - // repairLiveintervalsInRange() doesn't handle physical register, so we have - // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); - LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); - LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); - bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); - - MachineInstrBuilder Write2 - = BuildMI(*MBB, I, DL, Write2Desc) - .addOperand(*Addr) // addr - .addOperand(*Data0) // data0 - .addOperand(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); - - // XXX - How do we express subregisters here? 
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; - - LIS->RemoveMachineInstrFromMaps(I); - LIS->RemoveMachineInstrFromMaps(Paired); - I->eraseFromParent(); - Paired->eraseFromParent(); - - // This doesn't handle physical registers like M0 - LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); - - if (UpdateM0Range) { - SlotIndex Write2Index = LIS->getInstructionIndex(Write2); - M0Segment->end = Write2Index.getRegSlot(); - } - - DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Write2.getInstr(); -} - -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - - // Don't combine if volatile. - if (MI.hasOrderedMemoryRef()) { - ++I; - continue; - } - - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { - unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); - if (Match != E) { - Modified = true; - I = mergeRead2Pair(I, Match, Size); - } else { - ++I; - } - - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { - unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); - if (Match != E) { - Modified = true; - I = mergeWrite2Pair(I, Match, Size); - } else { - ++I; - } - - continue; - } - - ++I; - } - - return Modified; -} - -bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - const TargetSubtargetInfo &STM = MF.getSubtarget(); - TRI = static_cast(STM.getRegisterInfo()); - TII = static_cast(STM.getInstrInfo()); - MRI = &MF.getRegInfo(); - - LIS = &getAnalysis(); - - DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); - - assert(!MRI->isSSA()); - - bool Modified = false; - - for (MachineBasicBlock &MBB : MF) - Modified |= optimizeBlock(MBB); - - return Modified; -} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp deleted file mode 100644 index c319b32111f..00000000000 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ /dev/null @@ -1,605 +0,0 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass lowers the pseudo control flow instructions to real -/// machine instructions. -/// -/// All control flow is handled using predicated instructions and -/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector -/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs -/// by writting to the 64-bit EXEC register (each bit corresponds to a -/// single vector ALU). Typically, for predicates, a vector ALU will write -/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each -/// Vector ALU) and then the ScalarALU will AND the VCC register with the -/// EXEC to update the predicates. 
-/// -/// For example: -/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -/// %SGPR0 = SI_IF %VCC -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -/// %SGPR0 = SI_ELSE %SGPR0 -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -/// SI_END_CF %SGPR0 -/// -/// becomes: -/// -/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask -/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_CBRANCH_EXECZ label0 // This instruction is an optional -/// // optimization which allows us to -/// // branch if all the bits of -/// // EXEC are zero. -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch -/// -/// label0: -/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block -/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_BRANCH_EXECZ label1 // Use our branch optimization -/// // instruction again. -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block -/// label1: -/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" - -using namespace llvm; - -namespace { - -class SILowerControlFlowPass : public MachineFunctionPass { - -private: - static const unsigned SkipThreshold = 12; - - static char ID; - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - - bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); - - void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); - - void If(MachineInstr &MI); - void Else(MachineInstr &MI); - void Break(MachineInstr &MI); - void IfBreak(MachineInstr &MI); - void ElseBreak(MachineInstr &MI); - void Loop(MachineInstr &MI); - void EndCf(MachineInstr &MI); - - void Kill(MachineInstr &MI); - void Branch(MachineInstr &MI); - - void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); - void IndirectSrc(MachineInstr &MI); - void IndirectDst(MachineInstr &MI); - -public: - SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Lower control flow instructions"; - } - -}; - -} // End anonymous namespace - -char SILowerControlFlowPass::ID = 0; - -FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { - return new SILowerControlFlowPass(tm); -} - -bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, - MachineBasicBlock *To) { - - unsigned NumInstr = 0; - - for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); - MBB = *MBB->succ_begin()) { - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - NumInstr < SkipThreshold && I != E; ++I) { - - if (I->isBundle() || !I->isBundled()) - if (++NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { - - if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) - return; - - DebugLoc DL = From.getDebugLoc(); - BuildMI(*From.getParent(), 
&From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); -} - -void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - if (MBB.getParent()->getInfo()->getShaderType() != - ShaderType::PIXEL || - !shouldSkip(&MBB, &MBB.getParent()->back())) - return; - - MachineBasicBlock::iterator Insert = &MI; - ++Insert; - - // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); - - // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); -} - -void SILowerControlFlowPass::If(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) - .addReg(Vcc); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - Skip(MI, MI.getOperand(2)); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Else(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) - .addReg(Src); // Saved EXEC - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Dst); - - Skip(MI, MI.getOperand(2)); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Break(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Vcc) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Saved = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Saved) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Loop(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Src = MI.getOperand(0).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); - - MI.eraseFromParent(); -} - -void 
SILowerControlFlowPass::EndCf(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Branch(MachineInstr &MI) { - if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) - MI.eraseFromParent(); - - // If these aren't equal, this is probably an infinite loop. -} - -void SILowerControlFlowPass::Kill(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Op = MI.getOperand(0); - -#ifndef NDEBUG - const SIMachineFunctionInfo *MFI - = MBB.getParent()->getInfo(); - // Kill is only allowed in pixel / geometry shaders. - assert(MFI->getShaderType() == ShaderType::PIXEL || - MFI->getShaderType() == ShaderType::GEOMETRY); -#endif - - // Clear this thread from the exec mask if the operand is negative - if ((Op.isImm())) { - // Constant operand: Set exec mask to 0 or do nothing - if (Op.getImm() & 0x80000000) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addImm(0); - } - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) - .addImm(0) - .addOperand(Op); - } - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I = MI; - - unsigned Save = MI.getOperand(1).getReg(); - unsigned Idx = MI.getOperand(3).getReg(); - - if (AMDGPU::SReg_32RegClass.contains(Idx)) { - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx) - .addImm(Offset); - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx); - } - MBB.insert(I, MovRel); - } else { - - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VGPR_32RegClass.contains(Idx)); - - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); - - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); - - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); - - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); - - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); - - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - // Do the actual move - MBB.insert(I, MovRel); - - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); - - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); - - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); - - } - MI.eraseFromParent(); -} - -/// \param @VecReg The register which holds 
element zero of the vector -/// being addressed into. -/// \param[out] @Reg The base register to use in the indirect addressing instruction. -/// \param[in,out] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] -// As an output, this is a constant value that needs -// to be added to the value stored in M0. -void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, - unsigned &Reg, - int &Offset) { - unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); - if (!SubReg) - SubReg = VecReg; - - const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; - - if (RegIdx < 0) { - Offset = RegIdx; - RegIdx = 0; - } else { - Offset = 0; - } - - Reg = RC->getRegister(RegIdx); -} - -void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vec = MI.getOperand(2).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Reg; - - computeIndirectRegAndOffset(Vec, Reg, Off); - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) - .addReg(Vec, RegState::Implicit); - - LoadM0(MI, MovRel, Off); -} - -void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Val = MI.getOperand(5).getReg(); - unsigned Reg; - - computeIndirectRegAndOffset(Dst, Reg, Off); - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(Reg, RegState::Define) - .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) - .addReg(Dst, RegState::Implicit); - - LoadM0(MI, MovRel, Off); -} - -bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - SIMachineFunctionInfo *MFI = MF.getInfo(); - - bool HaveKill = false; - bool NeedWQM = false; - bool NeedFlat = false; - unsigned Depth = 0; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - - MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) - NeedWQM = true; - - // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) - NeedFlat = true; - - switch (MI.getOpcode()) { - default: break; - case AMDGPU::SI_IF: - ++Depth; - If(MI); - break; - - case AMDGPU::SI_ELSE: - Else(MI); - break; - - case AMDGPU::SI_BREAK: - Break(MI); - break; - - case AMDGPU::SI_IF_BREAK: - IfBreak(MI); - break; - - case AMDGPU::SI_ELSE_BREAK: - ElseBreak(MI); - break; - - case AMDGPU::SI_LOOP: - ++Depth; - Loop(MI); - break; - - case AMDGPU::SI_END_CF: - if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); - HaveKill = false; - } - EndCf(MI); - break; - - case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else - HaveKill = true; - Kill(MI); - break; - - case AMDGPU::S_BRANCH: - Branch(MI); - break; - - case AMDGPU::SI_INDIRECT_SRC: - IndirectSrc(MI); - break; - - case AMDGPU::SI_INDIRECT_DST_V1: - case AMDGPU::SI_INDIRECT_DST_V2: - case AMDGPU::SI_INDIRECT_DST_V4: - case AMDGPU::SI_INDIRECT_DST_V8: - case AMDGPU::SI_INDIRECT_DST_V16: - IndirectDst(MI); - break; - } - } - } - - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { - MachineBasicBlock &MBB = MF.front(); - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC).addReg(AMDGPU::EXEC); - } - - // FIXME: This seems inappropriate to do here. - if (NeedFlat && MFI->IsKernel) { - // Insert the prologue initializing the SGPRs pointing to the scratch space - // for flat accesses. - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - - // TODO: What to use with function calls? - - // FIXME: This is reporting stack size that is used in a scratch buffer - // rather than registers as well. - uint64_t StackSizeBytes = FrameInfo->getStackSize(); - - int IndirectBegin - = static_cast(TII)->getIndirectIndexBegin(MF); - // Convert register index to 256-byte unit. - uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); - - assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && - "Stack limits should be smaller than 16-bits"); - - // Initialize the flat scratch register pair. - // TODO: Can we use one s_mov_b64 here? - - // Offset is in units of 256-bytes. - MachineBasicBlock &MBB = MF.front(); - DebugLoc NoDL; - MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); - const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); - - assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); - - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) - .addImm(StackOffset); - - // Documentation says size is "per-thread scratch size in bytes" - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) - .addImm(StackSizeBytes); - } - - return true; -} diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp deleted file mode 100644 index 67421e231d8..00000000000 --- a/lib/Target/R600/SILowerI1Copies.cpp +++ /dev/null @@ -1,151 +0,0 @@ -//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// i1 values are usually inserted by the CFG Structurize pass and they are -/// unique in that they can be copied from VALU to SALU registers. -/// This is not possible for any other value type. Since there are no -/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. 
-/// -//===----------------------------------------------------------------------===// -// - -#define DEBUG_TYPE "si-i1-copies" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -namespace { - -class SILowerI1Copies : public MachineFunctionPass { -public: - static char ID; - -public: - SILowerI1Copies() : MachineFunctionPass(ID) { - initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Lower i1 Copies"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) - -char SILowerI1Copies::ID = 0; - -char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; - -FunctionPass *llvm::createSILowerI1CopiesPass() { - return new SILowerI1Copies(); -} - -bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - std::vector I1Defs; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = MRI.getRegClass(Reg); - if (RC == &AMDGPU::VReg_1RegClass) - MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); - continue; - } - - if (MI.getOpcode() != AMDGPU::COPY) - continue; - - const MachineOperand &Dst = MI.getOperand(0); - const MachineOperand &Src = MI.getOperand(1); - - if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) - continue; - - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); - - if (DstRC == &AMDGPU::VReg_1RegClass && - TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { - I1Defs.push_back(Dst.getReg()); - DebugLoc DL = MI.getDebugLoc(); - - MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); - if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { - if (DefInst->getOperand(1).isImm()) { - I1Defs.push_back(Dst.getReg()); - - int64_t Val = DefInst->getOperand(1).getImm(); - assert(Val == 0 || Val == -1); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) - .addOperand(Dst) - .addImm(Val); - MI.eraseFromParent(); - continue; - } - } - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .addOperand(Dst) - .addImm(0) - .addImm(-1) - .addOperand(Src); - MI.eraseFromParent(); - } else if 
(TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && - SrcRC == &AMDGPU::VReg_1RegClass) { - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) - .addOperand(Dst) - .addOperand(Src) - .addImm(0); - MI.eraseFromParent(); - } - } - } - - for (unsigned Reg : I1Defs) - MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); - - return false; -} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp deleted file mode 100644 index 587ea63d679..00000000000 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - - -#include "SIMachineFunctionInfo.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -#define MAX_LANES 64 - -using namespace llvm; - - -// Pin the vtable to this file. -void SIMachineFunctionInfo::anchor() {} - -SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) - : AMDGPUMachineFunction(MF), - TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), - PSInputAddr(0), - NumUserSGPRs(0), - LDSWaveSpillSize(0) { } - -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( - MachineFunction *MF, - unsigned FrameIndex, - unsigned SubIdx) { - const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast( - MF->getSubtarget().getRegisterInfo()); - MachineRegisterInfo &MRI = MF->getRegInfo(); - int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); - Offset += SubIdx * 4; - - unsigned LaneVGPRIdx = Offset / (64 * 4); - unsigned Lane = (Offset / 4) % 64; - - struct SpilledReg Spill; - - if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); - LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); - - // Add this register as live-in to all blocks to avoid machine verifer - // complaining about use of an undefined physical register. - for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); - BI != BE; ++BI) { - BI->addLiveIn(LaneVGPR); - } - } - - Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - Spill.Lane = Lane; - return Spill; -} - -unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( - const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); - // FIXME: We should get this information from kernel attributes if it - // is available. - return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize(); -} diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h deleted file mode 100644 index 667da4c8af6..00000000000 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ /dev/null @@ -1,66 +0,0 @@ -//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H - -#include "AMDGPUMachineFunction.h" -#include "SIRegisterInfo.h" -#include - -namespace llvm { - -class MachineRegisterInfo; - -/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which -/// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo : public AMDGPUMachineFunction { - void anchor() override; - - unsigned TIDReg; - bool HasSpilledVGPRs; - -public: - - struct SpilledReg { - unsigned VGPR; - int Lane; - SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(0), Lane(-1) { } - bool hasLane() { return Lane != -1;} - }; - - // SIMachineFunctionInfo definition - - SIMachineFunctionInfo(const MachineFunction &MF); - SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, - unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; - bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; - unsigned getTIDReg() const { return TIDReg; }; - void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } - - unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; -}; - -} // End namespace llvm - - -#endif diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp deleted file mode 100644 index 0a7f684552f..00000000000 --- a/lib/Target/R600/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,194 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. - if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. 
- if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; // Size - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp deleted file mode 100644 index db2ff0b1f95..00000000000 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ /dev/null @@ -1,543 +0,0 @@ -//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI implementation of the TargetRegisterInfo class. 
-// -//===----------------------------------------------------------------------===// - - -#include "SIRegisterInfo.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} - -BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - Reserved.set(AMDGPU::EXEC); - - // EXEC_LO and EXEC_HI could be allocated and used as regular register, - // but this seems likely to result in bugs, so I'm marking them as reserved. - Reserved.set(AMDGPU::EXEC_LO); - Reserved.set(AMDGPU::EXEC_HI); - - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - Reserved.set(AMDGPU::FLAT_SCR); - Reserved.set(AMDGPU::FLAT_SCR_LO); - Reserved.set(AMDGPU::FLAT_SCR_HI); - - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - Reserved.set(AMDGPU::VGPR255); - Reserved.set(AMDGPU::VGPR254); - - // Tonga and Iceland can only allocate a fixed number of SGPRs due - // to a hw bug. - if (MF.getSubtarget().hasSGPRInitBug()) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - // Assume XNACK_MASK is unused. - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; - - for (unsigned i = Limit; i < NumSGPRs; ++i) { - unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); - MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); - - for (; R.isValid(); ++R) - Reserved.set(*R); - } - } - - return Reserved; -} - -unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const { - - const AMDGPUSubtarget &STI = MF.getSubtarget(); - // FIXME: We should adjust the max number of waves based on LDS size. 
- unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), - STI.getMaxWavesPerCU()); - unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); - - for (regclass_iterator I = regclass_begin(), E = regclass_end(); - I != E; ++I) { - - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); - unsigned Limit; - - if (isSGPRClass(*I)) { - Limit = SGPRLimit / NumSubRegs; - } else { - Limit = VGPRLimit / NumSubRegs; - } - - const int *Sets = getRegClassPressureSets(*I); - assert(Sets); - for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) - return Limit; - } - } - return 256; -} - -bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo()->hasStackObjects(); -} - -static unsigned getNumSubRegsForSpillOp(unsigned Op) { - - switch (Op) { - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V512_RESTORE: - return 16; - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V256_RESTORE: - return 8; - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V128_RESTORE: - return 4; - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V96_RESTORE: - return 3; - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V64_RESTORE: - return 2; - case AMDGPU::SI_SPILL_S32_SAVE: - case AMDGPU::SI_SPILL_S32_RESTORE: - case AMDGPU::SI_SPILL_V32_SAVE: - case AMDGPU::SI_SPILL_V32_RESTORE: - return 1; - default: llvm_unreachable("Invalid spill opcode"); - } -} - -void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - unsigned Value, - unsigned ScratchRsrcReg, - unsigned ScratchOffset, - int64_t Offset, - RegScavenger *RS) const { - - MachineBasicBlock *MBB = MI->getParent(); - const MachineFunction *MF = MI->getParent()->getParent(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - LLVMContext &Ctx = MF->getFunction()->getContext(); - DebugLoc DL = MI->getDebugLoc(); - bool IsLoad = TII->get(LoadStoreOp).mayLoad(); - - bool RanOutOfSGPRs = false; - unsigned SOffset = ScratchOffset; - - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned Size = NumSubRegs * 4; - - if (!isUInt<12>(Offset + Size)) { - SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); - if (SOffset == AMDGPU::NoRegister) { - RanOutOfSGPRs = true; - SOffset = AMDGPU::SGPR0; - } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffset) - .addImm(Offset); - Offset = 0; - } - - if (RanOutOfSGPRs) - Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); - - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { - unsigned SubReg = NumSubRegs > 1 ? 
- getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : - Value; - bool IsKill = (i == e - 1); - - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); - } -} - -void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); - MachineBasicBlock *MBB = MI->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - DebugLoc DL = MI->getDebugLoc(); - - MachineOperand &FIOp = MI->getOperand(FIOperandNum); - int Index = MI->getOperand(FIOperandNum).getIndex(); - - switch (MI->getOpcode()) { - // SGPR register spill - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S32_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), - Spill.VGPR) - .addReg(SubReg) - .addImm(Spill.Lane); - - } - MI->eraseFromParent(); - break; - } - - // SGPR register restore - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_S32_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } - - // TODO: only do this when it is needed - switch (MF->getSubtarget().getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); - break; - case AMDGPUSubtarget::SEA_ISLANDS: - break; - default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. 
- TII->insertNOPs(MI, 4); - } - - MI->eraseFromParent(); - break; - } - - // VGPR register spill - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: - buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); - MI->eraseFromParent(); - break; - case AMDGPU::SI_SPILL_V32_RESTORE: - case AMDGPU::SI_SPILL_V64_RESTORE: - case AMDGPU::SI_SPILL_V96_RESTORE: - case AMDGPU::SI_SPILL_V128_RESTORE: - case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: { - buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); - MI->eraseFromParent(); - break; - } - - default: { - int64_t Offset = FrameInfo->getObjectOffset(Index); - FIOp.ChangeToImmediate(Offset); - if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addImm(Offset); - FIOp.ChangeToRegister(TmpReg, false, false, true); - } - } - } -} - -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - -unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { - return getEncodingValue(Reg) & 0xff; -} - -const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - - static const TargetRegisterClass *BaseClasses[] = { - &AMDGPU::VGPR_32RegClass, - &AMDGPU::SReg_32RegClass, - &AMDGPU::VReg_64RegClass, - &AMDGPU::SReg_64RegClass, - &AMDGPU::VReg_96RegClass, - &AMDGPU::VReg_128RegClass, - &AMDGPU::SReg_128RegClass, - &AMDGPU::VReg_256RegClass, - &AMDGPU::SReg_256RegClass, - &AMDGPU::VReg_512RegClass - }; - - for (const TargetRegisterClass *BaseClass : BaseClasses) { - if (BaseClass->contains(Reg)) { - return BaseClass; - } - } - return nullptr; -} - -bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); -} - -const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const { - if (hasVGPRs(SRC)) { - return SRC; - } else if (SRC == &AMDGPU::SCCRegRegClass) { - return &AMDGPU::VCCRegRegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VGPR_32RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { - return &AMDGPU::VReg_64RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { - return &AMDGPU::VReg_128RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { - return 
&AMDGPU::VReg_256RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { - return &AMDGPU::VReg_512RegClass; - } - return nullptr; -} - -const TargetRegisterClass *SIRegisterInfo::getSubRegClass( - const TargetRegisterClass *RC, unsigned SubIdx) const { - if (SubIdx == AMDGPU::NoSubRegister) - return RC; - - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. - if (isSGPRClass(RC)) { - return &AMDGPU::SGPR_32RegClass; - } else { - return &AMDGPU::VGPR_32RegClass; - } -} - -unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, - const TargetRegisterClass *SubRC, - unsigned Channel) const { - - switch (Reg) { - case AMDGPU::VCC: - switch(Channel) { - case 0: return AMDGPU::VCC_LO; - case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); - } - - case AMDGPU::FLAT_SCR: - switch (Channel) { - case 0: - return AMDGPU::FLAT_SCR_LO; - case 1: - return AMDGPU::FLAT_SCR_HI; - default: - llvm_unreachable("Invalid SubIdx for FLAT_SCR"); - } - break; - - case AMDGPU::EXEC: - switch (Channel) { - case 0: - return AMDGPU::EXEC_LO; - case 1: - return AMDGPU::EXEC_HI; - default: - llvm_unreachable("Invalid SubIdx for EXEC"); - } - break; - } - - const TargetRegisterClass *RC = getPhysRegClass(Reg); - // 32-bit registers don't have sub-registers, so we can just return the - // Reg. We need to have this check here, because the calculation below - // using getHWRegIndex() will fail with special 32-bit registers like - // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. - if (RC->getSize() == 4) { - assert(Channel == 0); - return Reg; - } - - unsigned Index = getHWRegIndex(Reg); - return SubRC->getRegister(Index + Channel); -} - -bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { - return OpType == AMDGPU::OPERAND_REG_IMM32; -} - -bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { - if (opCanUseLiteralConstant(OpType)) - return true; - - return OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - -unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const { - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: - return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: - return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected preloaded value type"); -} - -/// \brief Returns a register that is not used at any point in the function. -/// If all registers are used, then this function will return -// AMDGPU::NoRegister. 
-unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } - return AMDGPU::NoRegister; -} - -unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { - switch(WaveCount) { - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return 256; - } -} - -unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const { - if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WaveCount) { - case 10: return 80; - case 9: return 80; - case 8: return 96; - default: return 102; - } - } else { - switch(WaveCount) { - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return 103; - } - } -} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h deleted file mode 100644 index bfdb67c5e12..00000000000 --- a/lib/Target/R600/SIRegisterInfo.h +++ /dev/null @@ -1,131 +0,0 @@ -//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for SIRegisterInfo -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Support/Debug.h" - -namespace llvm { - -struct SIRegisterInfo : public AMDGPURegisterInfo { - - SIRegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override; - - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override; - - bool requiresRegisterScavenging(const MachineFunction &Fn) const override; - - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; - - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - - unsigned getHWRegIndex(unsigned Reg) const override; - - /// \brief Return the 'base' register class for this register. - /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. - const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; - - /// \returns true if this class contains only SGPR registers - bool isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) - return false; - - return !hasVGPRs(RC); - } - - /// \returns true if this class ID contains only SGPR registers - bool isSGPRClassID(unsigned RCID) const { - if (static_cast(RCID) == -1) - return false; - - return isSGPRClass(getRegClass(RCID)); - } - - /// \returns true if this class contains VGPR registers. 
- bool hasVGPRs(const TargetRegisterClass *RC) const; - - /// \returns A VGPR reg class with the same width as \p SRC - const TargetRegisterClass *getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const; - - /// \returns The register class that is used for a sub-register of \p RC for - /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will - /// be returned. - const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, - unsigned SubIdx) const; - - /// \p Channel This is the register channel (e.g. a value from 0-16), not the - /// SubReg index. - /// \returns The sub-register of Reg that is in Channel. - unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, - unsigned Channel) const; - - /// \returns True if operands defined with this operand type can accept - /// a literal constant (i.e. any 32-bit immediate). - bool opCanUseLiteralConstant(unsigned OpType) const; - - /// \returns True if operands defined with this operand type can accept - /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. - bool opCanUseInlineConstant(unsigned OpType) const; - - enum PreloadedValue { - TGID_X, - TGID_Y, - TGID_Z, - SCRATCH_WAVE_OFFSET, - SCRATCH_PTR, - INPUT_PTR, - TIDIG_X, - TIDIG_Y, - TIDIG_Z - }; - - /// \brief Returns the physical register that \p Value is stored in. - unsigned getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const; - - /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumVGPRsAllowed(unsigned WaveCount) const; - - /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const; - - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; - -private: - void buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, unsigned Value, - unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, RegScavenger *RS) const; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td deleted file mode 100644 index 2a9017fa2a9..00000000000 --- a/lib/Target/R600/SIRegisterInfo.td +++ /dev/null @@ -1,284 +0,0 @@ -//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Declarations that describe the SI registers -//===----------------------------------------------------------------------===// - -class SIReg encoding = 0> : Register { - let Namespace = "AMDGPU"; - let HWEncoding = encoding; -} - -// Special Registers -def VCC_LO : SIReg<"vcc_lo", 106>; -def VCC_HI : SIReg<"vcc_hi", 107>; - -// VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = 106; -} - -def EXEC_LO : SIReg<"exec_lo", 126>; -def EXEC_HI : SIReg<"exec_hi", 127>; - -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = 126; -} - -def SCC : SIReg<"scc", 253>; -def M0 : SIReg <"m0", 124>; - -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. - -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; -} - -// SGPR registers -foreach Index = 0-101 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index>; -} - -// VGPR registers -foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index> { - let HWEncoding{8} = 1; - } -} - -//===----------------------------------------------------------------------===// -// Groupings using register classes and tuples -//===----------------------------------------------------------------------===// - -// SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 101))>; - -// SGPR 64-bit registers -def SGPR_64Regs : RegisterTuples<[sub0, sub1], - [(add (decimate (trunc SGPR_32, 101), 2)), - (add (decimate (shl SGPR_32, 1), 2))]>; - -// SGPR 128-bit registers -def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (decimate (trunc SGPR_32, 99), 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4))]>; - -// SGPR 256-bit registers -def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (decimate (trunc SGPR_32, 95), 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4))]>; - -// SGPR 512-bit registers -def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (decimate (trunc SGPR_32, 87), 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4)), - (add (decimate (shl SGPR_32, 8), 4)), - (add (decimate (shl SGPR_32, 9), 4)), - (add (decimate (shl SGPR_32, 10), 4)), - (add (decimate (shl SGPR_32, 11), 4)), - (add (decimate (shl SGPR_32, 12), 4)), - (add (decimate (shl SGPR_32, 13), 4)), - (add 
(decimate (shl SGPR_32, 14), 4)), - (add (decimate (shl SGPR_32, 15), 4))]>; - -// VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "VGPR%u", 0, 255))>; - -// VGPR 64-bit registers -def VGPR_64 : RegisterTuples<[sub0, sub1], - [(add (trunc VGPR_32, 255)), - (add (shl VGPR_32, 1))]>; - -// VGPR 96-bit registers -def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], - [(add (trunc VGPR_32, 254)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2))]>; - -// VGPR 128-bit registers -def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (trunc VGPR_32, 253)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3))]>; - -// VGPR 256-bit registers -def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (trunc VGPR_32, 249)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7))]>; - -// VGPR 512-bit registers -def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (trunc VGPR_32, 241)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7)), - (add (shl VGPR_32, 8)), - (add (shl VGPR_32, 9)), - (add (shl VGPR_32, 10)), - (add (shl VGPR_32, 11)), - (add (shl VGPR_32, 12)), - (add (shl VGPR_32, 13)), - (add (shl VGPR_32, 14)), - (add (shl VGPR_32, 15))]>; - -//===----------------------------------------------------------------------===// -// Register classes used as source and destination -//===----------------------------------------------------------------------===// - -class RegImmMatcher : AsmOperandClass { - let Name = name; - let RenderMethod = "addRegOrImmOperands"; -} - -// Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { - let CopyCost = -1; // Theoretically it is possible to read from SCC, - // but it should never be necessary. 
-} - -def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; -def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; - -// Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) ->; - -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; - -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, - (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) ->; - -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; - -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; - -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; - -// Register class for all vector registers (VGPRs + Interpolation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; - -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { - let Size = 96; -} - -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; - -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; - -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; - -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { - let Size = 32; -} - -class RegImmOperand : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; -} - -class RegInlineOperand : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; -} - -//===----------------------------------------------------------------------===// -// SSrc_* Operands with an SGPR or a 32-bit immediate -//===----------------------------------------------------------------------===// - -def SSrc_32 : RegImmOperand { - let ParserMatchClass = RegImmMatcher<"SSrc32">; -} - -def SSrc_64 : RegImmOperand { - let ParserMatchClass = RegImmMatcher<"SSrc64">; -} - -//===----------------------------------------------------------------------===// -// SCSrc_* Operands with an SGPR or an inline constant -//===----------------------------------------------------------------------===// - -def SCSrc_32 : RegInlineOperand { - let ParserMatchClass = RegImmMatcher<"SCSrc32">; -} - -//===----------------------------------------------------------------------===// -// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate -//===----------------------------------------------------------------------===// - -def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; - -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; - -def VSrc_32 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; - let ParserMatchClass = RegImmMatcher<"VSrc32">; -} - -def VSrc_64 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; - let ParserMatchClass = RegImmMatcher<"VSrc64">; -} - -//===----------------------------------------------------------------------===// -// VCSrc_* Operands with an SGPR, VGPR or an inline constant -//===----------------------------------------------------------------------===// - -def VCSrc_32 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc32">; -} - -def VCSrc_64 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType =
"OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc64">; -} diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td deleted file mode 100644 index 9b1f676020b..00000000000 --- a/lib/Target/R600/SISchedule.td +++ /dev/null @@ -1,91 +0,0 @@ -//===-- SISchedule.td - SI Scheduling definitions -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// MachineModel definitions for Southern Islands (SI) -// -//===----------------------------------------------------------------------===// - -def WriteBranch : SchedWrite; -def WriteExport : SchedWrite; -def WriteLDS : SchedWrite; -def WriteSALU : SchedWrite; -def WriteSMEM : SchedWrite; -def WriteVMEM : SchedWrite; - -// Vector ALU instructions -def Write32Bit : SchedWrite; -def WriteQuarterRate32 : SchedWrite; - -def WriteFloatFMA : SchedWrite; - -def WriteDouble : SchedWrite; -def WriteDoubleAdd : SchedWrite; - -def SIFullSpeedModel : SchedMachineModel; -def SIQuarterSpeedModel : SchedMachineModel; - -// BufferSize = 0 means the processors are in-order. -let BufferSize = 0 in { - -// XXX: Are the resource counts correct? -def HWBranch : ProcResource<1>; -def HWExport : ProcResource<7>; // Taken from S_WAITCNT -def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT -def HWSALU : ProcResource<1>; -def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT -def HWVALU : ProcResource<1>; - -} - -class HWWriteRes resources, - int latency> : WriteRes { - let Latency = latency; -} - -class HWVALUWriteRes : - HWWriteRes; - - -// The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be accurate. - -// The latency values are 1 / (operations / cycle) / 4. -multiclass SICommonWriteRes { - - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 2 - 64 - def : HWWriteRes; - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 300 - 600 - - def : HWVALUWriteRes; - def : HWVALUWriteRes; -} - - -let SchedModel = SIFullSpeedModel in { - -defm : SICommonWriteRes; - -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; - -} // End SchedModel = SIFullSpeedModel - -let SchedModel = SIQuarterSpeedModel in { - -defm : SICommonWriteRes; - -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; - -} // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp deleted file mode 100644 index 51e72cdb5f9..00000000000 --- a/lib/Target/R600/SIShrinkInstructions.cpp +++ /dev/null @@ -1,272 +0,0 @@ -//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// The pass tries to use the 32-bit encoding for instructions when possible.
-//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPUMCInstLower.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#define DEBUG_TYPE "si-shrink-instructions" - -STATISTIC(NumInstructionsShrunk, - "Number of 64-bit instruction reduced to 32-bit."); -STATISTIC(NumLiteralConstantsFolded, - "Number of literal constants folded into 32-bit instructions."); - -namespace llvm { - void initializeSIShrinkInstructionsPass(PassRegistry&); -} - -using namespace llvm; - -namespace { - -class SIShrinkInstructions : public MachineFunctionPass { -public: - static char ID; - -public: - SIShrinkInstructions() : MachineFunctionPass(ID) { - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Shrink Instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, - "SI Lower il Copies", false, false) -INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, - "SI Lower il Copies", false, false) - -char SIShrinkInstructions::ID = 0; - -FunctionPass *llvm::createSIShrinkInstructionsPass() { - return new SIShrinkInstructions(); -} - -static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - if (!MO->isReg()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) - return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); - - return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); -} - -static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, - const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - - const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc. We should handle this the same way we handle vopc, by adding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. - if (Src2) - return false; - - const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - const MachineOperand *Src1Mod = - TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - - if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) - return false; - - // We don't need to check src0, all input types are legal, so just make sure - // src0 isn't using any modifiers. - if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) - return false; - - // Check output modifiers - if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) - return false; - - if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) - return false; - - return true; -} - -/// \brief This function checks \p MI for operands defined by a move immediate -/// instruction and then folds the literal constant into the instruction if it -/// can.
This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction -/// and will only fold literal constants if we are still in SSA. -static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, - MachineRegisterInfo &MRI, bool TryToCommute = true) { - - if (!MRI.isSSA()) - return; - - assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || - TII->isVOPC(MI.getOpcode())); - - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - // Only one literal constant is allowed per instruction, so if src0 is a - // literal constant then we can't do any folding. - if (Src0.isImm() && - TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) - return; - - // Literal constants and SGPRs can only be used in Src0, so if Src0 is an - // SGPR, we cannot commute the instruction, so we can't fold any literal - // constants. - if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) - return; - - // Try to fold Src0 - if (Src0.isReg()) { - unsigned Reg = Src0.getReg(); - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (Def && Def->isMoveImmediate()) { - MachineOperand &MovSrc = Def->getOperand(1); - bool ConstantFolded = false; - - if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { - Src0.ChangeToImmediate(MovSrc.getImm()); - ConstantFolded = true; - } - if (ConstantFolded) { - if (MRI.use_empty(Reg)) - Def->eraseFromParent(); - ++NumLiteralConstantsFolded; - return; - } - } - } - - // We have failed to fold src0, so commute the instruction and try again. - if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) - foldImmediates(MI, TII, MRI, false); - -} - -bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - std::vector I1Defs; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. - if (MI.getOpcode() == AMDGPU::S_MOV_B32) { - const MachineOperand &Src = MI.getOperand(1); - - if (Src.isImm()) { - if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) - MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - } - - continue; - } - - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - - if (!canShrink(MI, TII, TRI, MRI)) { - // Try commuting the instruction and see if that enables us to shrink - // it. - if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || - !canShrink(MI, TII, TRI, MRI)) - continue; - } - - // getVOPe32 could be -1 here if we started with an instruction that had - // a 32-bit encoding and then commuted it to an instruction that did not. - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); - - if (TII->isVOPC(Op32)) { - unsigned DstReg = MI.getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DstReg)) { - // VOPC instructions can only write to the VCC register. 
We can't - // force them to use VCC here, because the register allocator has - // trouble with sequences like this, which cause the allocator to run - // out of registers if vreg0 and vreg1 belong to the VCCReg register - // class: - // vreg0 = VOPC; - // vreg1 = VOPC; - // S_AND_B64 vreg0, vreg1 - // - // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we will run - // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); - continue; - } - if (DstReg != AMDGPU::VCC) - continue; - } - - // We can shrink this instruction - DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); - - MachineInstrBuilder Inst32 = - BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - - // dst - Inst32.addOperand(MI.getOperand(0)); - - Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); - - const MachineOperand *Src1 = - TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1) - Inst32.addOperand(*Src1); - - ++NumInstructionsShrunk; - MI.eraseFromParent(); - - foldImmediates(*Inst32, TII, MRI); - DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); - - - } - } - return false; -} diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp deleted file mode 100644 index 591ce857cc7..00000000000 --- a/lib/Target/R600/SITypeRewriter.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type.
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "SI Type Rewriter"; - } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - Attribute A = F.getFnAttribute("ShaderType"); - - unsigned ShaderType = ShaderType::COMPUTE; - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - Str.getAsInteger(0, ShaderType); - } - if (ShaderType == ShaderType::COMPUTE) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector Args; - SmallVector Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - 
} - - if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp deleted file mode 100644 index d723d6e3e8b..00000000000 --- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetMachine.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -/// \brief The target which supports all AMD GPUs. This will eventually -/// be deprecated and there will be an R600 target and a GCN target. -Target llvm::TheAMDGPUTarget; -/// \brief The target for GCN GPUs -Target llvm::TheGCNTarget; - -/// \brief Extern function to initialize the targets for the AMDGPU backend -extern "C" void LLVMInitializeR600TargetInfo() { - RegisterTarget - R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); - RegisterTarget GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); -} diff --git a/lib/Target/R600/TargetInfo/CMakeLists.txt b/lib/Target/R600/TargetInfo/CMakeLists.txt deleted file mode 100644 index c3bd26c7a89..00000000000 --- a/lib/Target/R600/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMR600Info - AMDGPUTargetInfo.cpp - ) diff --git a/lib/Target/R600/TargetInfo/LLVMBuild.txt b/lib/Target/R600/TargetInfo/LLVMBuild.txt deleted file mode 100644 index c3d3cf51cc8..00000000000 --- a/lib/Target/R600/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600Info -parent = R600 -required_libraries = Support -add_to_library_groups = R600 diff --git a/lib/Target/R600/TargetInfo/Makefile b/lib/Target/R600/TargetInfo/Makefile deleted file mode 100644 index b8ac4e78230..00000000000 --- a/lib/Target/R600/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td deleted file mode 100644 index d8738f99263..00000000000 --- a/lib/Target/R600/VIInstrFormats.td +++ /dev/null @@ -1,166 +0,0 @@ -//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// VI Instruction format definitions. -// -//===----------------------------------------------------------------------===// - -class DSe_vi op> : Enc64 { - bits<8> vdst; - bits<1> gds; - bits<8> addr; - bits<8> data0; - bits<8> data1; - bits<8> offset0; - bits<8> offset1; - - let Inst{7-0} = offset0; - let Inst{15-8} = offset1; - let Inst{16} = gds; - let Inst{24-17} = op; - let Inst{31-26} = 0x36; //encoding - let Inst{39-32} = addr; - let Inst{47-40} = data0; - let Inst{55-48} = data1; - let Inst{63-56} = vdst; -} - -class MUBUFe_vi op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> lds; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{16} = lds; - let Inst{17} = slc; - let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class MTBUFe_vi op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{18-15} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class SMEMe_vi op, bit imm> : Enc64 { - bits<7> sbase; - bits<7> sdata; - bits<1> glc; - bits<20> offset; - - let Inst{5-0} = sbase{6-1}; - let Inst{12-6} = sdata; - let Inst{16} = glc; - let Inst{17} = imm; - let Inst{25-18} = op; - let Inst{31-26} = 0x30; //encoding - let Inst{51-32} = offset; -} - -class VOP3e_vi op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<1> clamp; - bits<2> omod; - - let Inst{7-0} = vdst; - let Inst{8} = src0_modifiers{1}; - let Inst{9} = src1_modifiers{1}; - let Inst{10} = src2_modifiers{1}; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOP3be_vi op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<7> sdst; - bits<2> omod; - bits<1> clamp; - - let Inst{7-0} = vdst; - let Inst{14-8} = sdst; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 
0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class EXPe_vi : EXPe { - let Inst{31-26} = 0x31; //encoding -} - -class VINTRPe_vi op> : VINTRPe { - let Inst{31-26} = 0x35; // encoding -} diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td deleted file mode 100644 index 5bf86e649ce..00000000000 --- a/lib/Target/R600/VIInstructions.td +++ /dev/null @@ -1,106 +0,0 @@ -//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Instruction definitions for VI and newer. -//===----------------------------------------------------------------------===// - -let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -defm V_CVT_F16_U16 : VOP1Inst , "v_cvt_f16_u16", VOP_F16_I16>; -defm V_CVT_F16_I16 : VOP1Inst , "v_cvt_f16_i16", VOP_F16_I16>; -defm V_CVT_U16_F16 : VOP1Inst , "v_cvt_u16_f16", VOP_I16_F16>; -defm V_CVT_I16_F16 : VOP1Inst , "v_cvt_i16_f16", VOP_I16_F16>; -defm V_RCP_F16 : VOP1Inst , "v_rcp_f16", VOP_F16_F16>; -defm V_SQRT_F16 : VOP1Inst , "v_sqrt_f16", VOP_F16_F16>; -defm V_RSQ_F16 : VOP1Inst , "v_rsq_f16", VOP_F16_F16>; -defm V_LOG_F16 : VOP1Inst , "v_log_f16", VOP_F16_F16>; -defm V_EXP_F16 : VOP1Inst , "v_exp_f16", VOP_F16_F16>; -defm V_FREXP_MANT_F16 : VOP1Inst , "v_frexp_mant_f16", - VOP_F16_F16 ->; -defm V_FREXP_EXP_I16_F16 : VOP1Inst , "v_frexp_exp_i16_f16", - VOP_I16_F16 ->; -defm V_FLOOR_F16 : VOP1Inst , "v_floor_f16", VOP_F16_F16>; -defm V_CEIL_F16 : VOP1Inst , "v_ceil_f16", VOP_F16_F16>; -defm V_TRUNC_F16 : VOP1Inst , "v_trunc_f16", VOP_F16_F16>; -defm V_RNDNE_F16 : VOP1Inst , "v_rndne_f16", VOP_F16_F16>; -defm V_FRACT_F16 : VOP1Inst , "v_fract_f16", VOP_F16_F16>; -defm V_SIN_F16 : VOP1Inst , "v_sin_f16", VOP_F16_F16>; -defm V_COS_F16 : VOP1Inst , "v_cos_f16", VOP_F16_F16>; - -//===----------------------------------------------------------------------===// -// VOP2 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { - -defm V_ADD_F16 : VOP2Inst , "v_add_f16", VOP_F16_F16_F16>; -defm V_SUB_F16 : VOP2Inst , "v_sub_f16", VOP_F16_F16_F16>; -defm V_SUBREV_F16 : VOP2Inst , "v_subrev_f16", VOP_F16_F16_F16, - null_frag, "v_sub_f16" ->; -defm V_MUL_F16 : VOP2Inst , "v_mul_f16", VOP_F16_F16_F16>; -defm V_MAC_F16 : VOP2Inst , "v_mac_f16", VOP_F16_F16_F16>; -} // End isCommutable = 1 -defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16">; -let isCommutable = 1 in { -defm V_MADAK_F16 : VOP2MADK , "v_madak_f16">; -defm V_ADD_U16 : VOP2Inst , "v_add_u16", VOP_I16_I16_I16>; -defm V_SUB_U16 : VOP2Inst , "v_sub_u16" , VOP_I16_I16_I16>; -defm V_SUBREV_U16 : VOP2Inst , "v_subrev_u16", VOP_I16_I16_I16>; -defm V_MUL_LO_U16 : VOP2Inst , "v_mul_lo_u16", VOP_I16_I16_I16>; -} // End isCommutable = 1 -defm V_LSHLREV_B16 : VOP2Inst , "v_lshlrev_b16", VOP_I16_I16_I16>; -defm V_LSHRREV_B16 : VOP2Inst , "v_lshrrev_b16", VOP_I16_I16_I16>; -defm V_ASHRREV_B16 : 
VOP2Inst , "v_ashrrev_b16", VOP_I16_I16_I16>; -let isCommutable = 1 in { -defm V_MAX_F16 : VOP2Inst , "v_max_f16", VOP_F16_F16_F16>; -defm V_MIN_F16 : VOP2Inst , "v_min_f16", VOP_F16_F16_F16>; -defm V_MAX_U16 : VOP2Inst , "v_max_u16", VOP_I16_I16_I16>; -defm V_MAX_I16 : VOP2Inst , "v_max_i16", VOP_I16_I16_I16>; -defm V_MIN_U16 : VOP2Inst , "v_min_u16", VOP_I16_I16_I16>; -defm V_MIN_I16 : VOP2Inst , "v_min_i16", VOP_I16_I16_I16>; -} // End isCommutable = 1 -defm V_LDEXP_F16 : VOP2Inst , "v_ldexp_f16", VOP_F16_F16_I16>; - -// Aliases to simplify matching of floating-pint instructions that are VOP2 on -// SI and VOP3 on VI. - -class SI2_VI3Alias : InstAlias < - name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0) ->, PredicateControl { - let UseInstAsmMatchConverter = 0; -} - -def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; - -} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI - -//===----------------------------------------------------------------------===// -// SMEM Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isVI] in { - -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) ->; - -} // End Predicates = [isVI] diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll new file mode 100644 index 00000000000..c7bcfd2ddab --- /dev/null +++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -0,0 +1,139 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and +; the global address space(1) uses 64-bit pointers. These tests check to make sure +; the correct pointer size is used for the local address space. + +; The e{{32|64}} suffix on the instructions refers to the encoding size and not +; the size of the operands. The operand size is denoted in the instruction name. +; Instructions with B32, U32, and I32 in their name take 32-bit operands, while +; instructions with B64, U64, and I64 take 64-bit operands. 
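+; A rough illustration of the naming convention described above (these example +; lines are not part of the checked output, and the register numbers are made +; up): ds_read_b32 v0, v1 reads a 32-bit value while ds_read_b64 v[0:1], v2 +; reads a 64-bit value, and v_and_b32_e32 v0, v1, v2 versus v_and_b32_e64 +; v0, v1, v2 are the 32-bit and 64-bit encodings of the same 32-bit AND.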
+ +; FUNC-LABEL: {{^}}local_address_load: +; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] +; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] +define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = load i32, i32 addrspace(3)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep: +; SI: s_add_i32 [[SPTR:s[0-9]]] +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_read_b32 [[VPTR]] +define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_const_offset: +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 +define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Offset too large, can't fold into 16-bit immediate offset. +; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: +; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_read_b32 [[VPTR]] +define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}null_32bit_lds_ptr: +; SI: v_cmp_ne_i32 +; SI-NOT: v_cmp_ne_i32 +; SI: v_cndmask_b32 +define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { + %cmp = icmp ne i32 addrspace(3)* %lds, null + %x = select i1 %cmp, i32 123, i32 456 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}mul_32bit_ptr: +; SI: s_mul_i32 +; SI-NEXT: s_add_i32 +; SI: ds_read_b32 +define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { + %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 + %val = load float, float addrspace(3)* %ptr + store float %val, float addrspace(1)* %out + ret void +} + +@g_lds = addrspace(3) global float undef, align 4 + +; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] +define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { + %val = load float, float addrspace(3)* @g_lds + store float %val, float addrspace(1)* %out + ret void +} + + +@ptr = addrspace(3) global i32 addrspace(3)* undef +@dst = addrspace(3) global [16384 x i32] undef + +; FUNC-LABEL: {{^}}global_ptr: +; SI: ds_write_b32 +define void @global_ptr() nounwind { + store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr + ret void +} + +; FUNC-LABEL: {{^}}local_address_store: +; SI: ds_write_b32 +define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { + store i32 %val, i32 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_store: +; SI: s_add_i32 [[SADDR:s[0-9]+]], +; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] +; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} +define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { + %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 %offset + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} +; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} +; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 +define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1 + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} + +; Offset too large, can't fold into 16-bit immediate offset. +; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: +; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} +define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/README b/test/CodeGen/AMDGPU/README new file mode 100644 index 00000000000..96998bba28f --- /dev/null +++ b/test/CodeGen/AMDGPU/README @@ -0,0 +1,21 @@ ++==============================================================================+ +| How to organize the lit tests | ++==============================================================================+ + +- If you write a test for matching a single DAG opcode or intrinsic, it should + go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll) + +- If you write a test that matches several DAG opcodes and checks for a single + ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g. + bfi_int.ll + +- For all other tests, use your best judgement for organizing tests and naming + the files. + ++==============================================================================+ +| Naming conventions | ++==============================================================================+ + +- Use dash '-' and not underscore '_' to separate words in file names, unless + the file is named after a DAG opcode or ISA instruction that has an + underscore '_' in its name. diff --git a/test/CodeGen/AMDGPU/add-debug.ll b/test/CodeGen/AMDGPU/add-debug.ll new file mode 100644 index 00000000000..529905dd36a --- /dev/null +++ b/test/CodeGen/AMDGPU/add-debug.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug +; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug +; REQUIRES: asserts + +; Check that SelectionDAGDumper does not crash on int_SI_if. 
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll new file mode 100644 index 00000000000..655e75dbc1a --- /dev/null +++ b/test/CodeGen/AMDGPU/add.ll @@ -0,0 +1,192 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test1: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} +;SI-NOT: [[REG]] +;SI: buffer_store_dword [[REG]], +define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test2: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = add <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test4: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = add <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test8: +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT + +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +entry: + %0 = add <8 x i32> %a, %b + store <8 x i32> %0, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test16: +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; 
EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT + +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +entry: + %0 = add <16 x i32> %a, %b + store <16 x i32> %0, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}add64: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = add i64 %a, %b + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they +; use VCC. The test is designed so that %a will be stored in an SGPR and +; %0 will be stored in a VGPR, so the comiler will be forced to copy %a +; to a VGPR before doing the add. + +; FUNC-LABEL: {{^}}add64_sgpr_vgpr: +; SI-NOT: v_addc_u32_e32 s + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { +entry: + %0 = load i64, i64 addrspace(1)* %in + %1 = add i64 %a, %0 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; Test i64 add inside a branch. +; FUNC-LABEL: {{^}}add64_in_branch: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll new file mode 100644 index 00000000000..8346add7df9 --- /dev/null +++ b/test/CodeGen/AMDGPU/add_i64.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + + +declare i32 @llvm.r600.read.tidig.x() readnone + +; SI-LABEL: {{^}}test_i64_vreg: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid + %a = load i64, i64 addrspace(1)* %a_ptr + %b = load i64, i64 addrspace(1)* %b_ptr + %result = add i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Check that the SGPR add operand is correctly moved to a VGPR. 
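+; (Illustrative sketch only, not a checked pattern: the expected lowering looks +; roughly like v_mov_b32_e32 v2, s0 to copy the SGPR operand into a VGPR, +; followed by the v_add_i32 / v_addc_u32 pair; the actual register numbers are +; chosen by the register allocator.)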
+; SI-LABEL: {{^}}sgpr_operand: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { + %foo = load i64, i64 addrspace(1)* %in, align 8 + %result = add i64 %foo, %a + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Swap the arguments. Check that the SGPR -> VGPR copy works with the +; SGPR as other operand. +; +; SI-LABEL: {{^}}sgpr_operand_reversed: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { + %foo = load i64, i64 addrspace(1)* %in, align 8 + %result = add i64 %a, %foo + store i64 %result, i64 addrspace(1)* %out + ret void +} + + +; SI-LABEL: {{^}}test_v2i64_sreg: +; SI: s_add_u32 +; SI: s_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 +define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { + %result = add <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}test_v2i64_vreg: +; SI: v_add_i32 +; SI: v_addc_u32 +; SI: v_add_i32 +; SI: v_addc_u32 +define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid + %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = add <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}trunc_i64_add_to_i32: +; SI: s_load_dword s[[SREG0:[0-9]+]] +; SI: s_load_dword s[[SREG1:[0-9]+]] +; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI-NOT: addc +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %add = add i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll new file mode 100644 index 00000000000..4be8c584752 --- /dev/null +++ b/test/CodeGen/AMDGPU/address-space.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; Test that codegenprepare understands address space sizes + +%struct.foo = type { [3 x float], [3 x float] } + +; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is +; already in a VGPR after the first read. 
+ +; CHECK-LABEL: {{^}}do_as_ptr_calcs: +; CHECK: s_load_dword [[SREG1:s[0-9]+]], +; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] +; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 +define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { +entry: + %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 + %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 + br label %bb32 + +bb32: + %a = load float, float addrspace(3)* %x, align 4 + %b = load float, float addrspace(3)* %y, align 4 + %cmp = fcmp one float %a, %b + br i1 %cmp, label %bb34, label %bb33 + +bb33: + unreachable + +bb34: + unreachable +} + + diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll new file mode 100644 index 00000000000..5672d470bd7 --- /dev/null +++ b/test/CodeGen/AMDGPU/and.ll @@ -0,0 +1,296 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test2: +; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = and <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test4: +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = and <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_and_i32: +; SI: s_and_b32 +define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_and_constant_i32: +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 +define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32: +; SI: v_and_b32 +define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { + %a = load i32, i32 
addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_constant_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} +define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} +define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, 64 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} +define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, -16 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_and_i64 +; SI: s_and_b64 +define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %and = and i64 %a, %b + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Should use SGPRs +; FUNC-LABEL: {{^}}s_and_i1: +; SI: v_and_b32 +define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { + %and = and i1 %a, %b + store i1 %and, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_and_constant_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { + %and = and i64 %a, 281474976710655 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i64: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i64_br: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %endif + +if: + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + br label %endif + +endif: + %tmp1 = phi i64 [%and, %if], [0, %entry] + store i64 %tmp1, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_constant_i64: +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 1234567 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Replace and 0 with mov 0 +; FUNC-LABEL: {{^}}v_and_inline_imm_i64: +; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 64 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: 
{{^}}s_and_inline_imm_64_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64 +define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 64 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1 +define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 1 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 +define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4607182418800017408 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 +define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13830554455654793216 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 +define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4602678819172646912 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 +define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13826050856027422720 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0 +define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4611686018427387904 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0 +define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13835058055282163712 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 +define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4616189618054758400 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 +define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13839561654909534208 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + + +; Test with the 64-bit integer bitpattern for a 32-bit float in the +; low 32-bits, which is not a valid 64-bit inline immmediate. 
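+; For reference, the single-precision bit pattern of 4.0 is 0x40800000,
+; i.e. 1082130432; zero-extended to 64 bits this is not the double
+; pattern of 4.0 (4616189618054758400, used above), so it should not be
+; encodable as a floating-point inline constant and is instead expected
+; to be materialized with a pair of s_mov_b32 before the s_and_b64.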
+ +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 1082130432 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Copy of -1 register +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} +; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}} +define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, -1065353216 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; Shift into upper 32-bits +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4647714815446351872 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13871086852301127680 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll new file mode 100644 index 00000000000..48d8f312249 --- /dev/null +++ b/test/CodeGen/AMDGPU/anyext.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}anyext_i1_i32: +; CHECK: v_cndmask_b32_e64 +define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + %1 = zext i1 %0 to i8 + %2 = xor i8 %1, -1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll new file mode 100644 index 00000000000..8c2a0795860 --- /dev/null +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -0,0 +1,44 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s + +declare i32 @llvm.SI.tid() nounwind readnone +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate + +; The required pointer calculations for the alloca'd actually requires +; an add and won't be folded into the addressing, which fails with a +; 64-bit pointer add. This should work since private pointers should +; be 32-bits. 
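+; As a rough sketch of the math involved: the alloca access below is at
+; %alloca + 1 * 16 + 4 * %b (16 bytes for the [4 x i32] index plus a
+; variable 4 * %b term), so an explicit add is needed, and with 32-bit
+; private pointers it should be a single v_add_i32.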
+ +; SI-LABEL: {{^}}test_private_array_ptr_calc: + +; FIXME: We end up with zero argument for ADD, because +; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index +; with the appropriate offset. We should fold this into the store. +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] +; +; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this +; alloca to a vector. It currently fails because it does not know how +; to interpret: +; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b + +; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 +; SI-PROMOTE: ds_write_b32 [[PTRREG]] +define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { + %alloca = alloca [4 x i32], i32 4, align 16 + %tid = call i32 @llvm.SI.tid() readnone + %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid + %a = load i32, i32 addrspace(1)* %a_ptr + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b + store i32 %result, i32* %alloca_ptr, align 4 + ; Dummy call + call void @llvm.AMDGPU.barrier.local() nounwind noduplicate + %reload = load i32, i32* %alloca_ptr, align 4 + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll new file mode 100644 index 00000000000..eae095eb844 --- /dev/null +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.SI.tid() readnone + +; SI-LABEL: {{^}}test_array_ptr_calc: +; SI: v_mul_lo_i32 +; SI: v_mul_hi_i32 +define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.SI.tid() readnone + %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 + %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid + %a = load i32, i32 addrspace(1)* %a_ptr + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll new file mode 100644 index 00000000000..ef2560ef184 --- /dev/null +++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: +; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SWAP:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] +; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: +; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] +; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] +; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic + %result = extractvalue { i64, i1 } %pair, 0 + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset +; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] +; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI: s_load_dwordx2 
s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] +; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] +; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic + %result = extractvalue { i64, i1 } %pair, 0 + ret void +} diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll new file mode 100644 index 00000000000..20c685447ee --- /dev/null +++ b/test/CodeGen/AMDGPU/atomic_load_add.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_local: +; R600: LDS_ADD * +; SI: ds_add_u32 +define void @atomic_add_local(i32 addrspace(3)* %local) { + %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_local_const_offset: +; R600: LDS_ADD * +; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 + %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_local: +; R600: LDS_ADD_RET * +; SI: ds_add_rtn_u32 +define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: +; R600: LDS_ADD_RET * +; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 + %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll new file mode 100644 index 00000000000..4c6f45525b9 --- /dev/null +++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_sub_local: +; R600: LDS_SUB * +; SI: ds_sub_u32 +define void @atomic_sub_local(i32 addrspace(3)* %local) { + %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst + ret void +} 
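+
+; For the constant-offset variants below, the byte offset is simply the
+; i32 GEP index times 4 (index 4 -> offset:16, index 5 -> offset:20),
+; which the ds instructions should be able to fold into their offset
+; field instead of emitting a separate address add.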
+ +; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: +; R600: LDS_SUB * +; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 + %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_local: +; R600: LDS_SUB_RET * +; SI: ds_sub_rtn_u32 +define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: +; R600: LDS_SUB_RET * +; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 + %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll new file mode 100644 index 00000000000..abdc4afef47 --- /dev/null +++ b/test/CodeGen/AMDGPU/basic-branch.ll @@ -0,0 +1,16 @@ +; XFAIL: * +; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_branch( +define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { + %cmp = icmp ne i32 %val, 0 + br i1 %cmp, label %store, label %end + +store: + store i32 222, i32 addrspace(1)* %out + ret void + +end: + ret void +} diff --git a/test/CodeGen/AMDGPU/basic-loop.ll b/test/CodeGen/AMDGPU/basic-loop.ll new file mode 100644 index 00000000000..f0263caf5d6 --- /dev/null +++ b/test/CodeGen/AMDGPU/basic-loop.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_loop: +define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { +entry: + br label %loop.body + +loop.body: + %i = phi i32 [0, %entry], [%i.inc, %loop.body] + store i32 222, i32 addrspace(1)* %out + %cmp = icmp ne i32 %i, %val + %i.inc = add i32 %i, 1 + br i1 %cmp, label %loop.body, label %end + +end: + ret void +} diff --git a/test/CodeGen/AMDGPU/bfe_uint.ll b/test/CodeGen/AMDGPU/bfe_uint.ll new file mode 100644 index 00000000000..32e3fc26106 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfe_uint.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}bfe_def: +; CHECK: BFE_UINT +define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 5 + %1 = and i32 %0, 15 ; 0xf + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; This program could be implemented using a BFE_UINT instruction, however +; since the lshr constant + number of bits in the mask is >= 32, it can also be +; implmented with a LSHR instruction, which is better, because LSHR has less +; operands and requires less constants. 
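+; Concretely, the shift below is by 16 and the mask keeps 16 bits, and
+; 16 + 16 >= 32, so for a 32-bit value (x >> 16) & 0xffff == x >> 16 and
+; the mask is redundant; bfe_def above, with 5 + 4 = 9 < 32 bits, still
+; needs an actual bit-field extract.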
+ +; CHECK: {{^}}bfe_shift: +; CHECK-NOT: BFE_UINT +define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 16 + %1 = and i32 %0, 65535 ; 0xffff + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll new file mode 100644 index 00000000000..03349349735 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfi_int.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; BFI_INT Definition pattern from ISA docs +; (y & x) | (z & ~x) +; +; R600: {{^}}bfi_def: +; R600: BFI_INT +; SI: @bfi_def +; SI: v_bfi_b32 +define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = xor i32 %x, -1 + %1 = and i32 %z, %0 + %2 = and i32 %y, %x + %3 = or i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; SHA-256 Ch function +; z ^ (x & (y ^ z)) +; R600: {{^}}bfi_sha256_ch: +; R600: BFI_INT +; SI: @bfi_sha256_ch +; SI: v_bfi_b32 +define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = xor i32 %y, %z + %1 = and i32 %x, %0 + %2 = xor i32 %z, %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; SHA-256 Ma function +; ((x & z) | (y & (x | z))) +; R600: {{^}}bfi_sha256_ma: +; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W +; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W +; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} +; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} + +define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = and i32 %x, %z + %1 = or i32 %x, %z + %2 = and i32 %y, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll new file mode 100644 index 00000000000..2671c5d102b --- /dev/null +++ b/test/CodeGen/AMDGPU/big_alu.ll @@ -0,0 +1,1173 @@ +;RUN: llc < %s -march=r600 -mcpu=cedar + +;This test ensures that R600 backend can handle ifcvt properly +;and do not generate ALU clauses with more than 128 instructions. 
+ +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { +main_body: + %0 = extractelement <4 x float> %reg0, i32 0 + %1 = extractelement <4 x float> %reg0, i32 1 + %2 = extractelement <4 x float> %reg0, i32 2 + %3 = extractelement <4 x float> %reg0, i32 3 + %4 = extractelement <4 x float> %reg1, i32 0 + %5 = extractelement <4 x float> %reg9, i32 0 + %6 = extractelement <4 x float> %reg8, i32 0 + %7 = fcmp ugt float %6, 0.000000e+00 + %8 = select i1 %7, float %4, float %5 + %9 = extractelement <4 x float> %reg1, i32 1 + %10 = extractelement <4 x float> %reg9, i32 1 + %11 = extractelement <4 x float> %reg8, i32 0 + %12 = fcmp ugt float %11, 0.000000e+00 + %13 = select i1 %12, float %9, float %10 + %14 = extractelement <4 x float> %reg1, i32 2 + %15 = extractelement <4 x float> %reg9, i32 2 + %16 = extractelement <4 x float> %reg8, i32 0 + %17 = fcmp ugt float %16, 0.000000e+00 + %18 = select i1 %17, float %14, float %15 + %19 = extractelement <4 x float> %reg1, i32 3 + %20 = extractelement <4 x float> %reg9, i32 3 + %21 = extractelement <4 x float> %reg8, i32 0 + %22 = extractelement <4 x float> %reg2, i32 0 + %23 = extractelement <4 x float> %reg2, i32 1 + %24 = extractelement <4 x float> %reg2, i32 2 + %25 = extractelement <4 x float> %reg2, i32 3 + %26 = extractelement <4 x float> %reg3, i32 0 + %27 = extractelement <4 x float> %reg3, i32 1 + %28 = extractelement <4 x float> %reg3, i32 2 + %29 = extractelement <4 x float> %reg3, i32 3 + %30 = extractelement <4 x float> %reg4, i32 0 + %31 = extractelement <4 x float> %reg4, i32 1 + %32 = extractelement <4 x float> %reg4, i32 2 + %33 = extractelement <4 x float> %reg4, i32 3 + %34 = extractelement <4 x float> %reg5, i32 0 + %35 = extractelement <4 x float> %reg5, i32 1 + %36 = extractelement <4 x float> %reg5, i32 2 + %37 = extractelement <4 x float> %reg5, i32 3 + %38 = extractelement <4 x float> %reg6, i32 0 + %39 = extractelement <4 x float> %reg6, i32 1 + %40 = extractelement <4 x float> %reg6, i32 2 + %41 = extractelement <4 x float> %reg6, i32 3 + %42 = extractelement <4 x float> %reg7, i32 0 + %43 = extractelement <4 x float> %reg7, i32 1 + %44 = extractelement <4 x float> %reg7, i32 2 + %45 = extractelement <4 x float> %reg7, i32 3 + %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %47 = extractelement <4 x float> %46, i32 0 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %49 = extractelement <4 x float> %48, i32 1 + %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %51 = extractelement <4 x float> %50, i32 2 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %53 = extractelement <4 x float> %52, i32 0 + %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %55 = extractelement <4 x float> %54, i32 0 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %57 
= extractelement <4 x float> %56, i32 1 + %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %59 = extractelement <4 x float> %58, i32 2 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %61 = extractelement <4 x float> %60, i32 3 + %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %63 = extractelement <4 x float> %62, i32 0 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %65 = extractelement <4 x float> %64, i32 1 + %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %67 = extractelement <4 x float> %66, i32 2 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %69 = extractelement <4 x float> %68, i32 0 + %70 = fcmp oge float %69, 3.500000e+00 + %71 = sext i1 %70 to i32 + %72 = bitcast i32 %71 to float + %73 = bitcast float %72 to i32 + %74 = icmp ne i32 %73, 0 + %. = select i1 %74, float 0.000000e+00, float 0.000000e+00 + %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %76 = extractelement <4 x float> %75, i32 0 + %77 = fcmp oge float %76, 2.000000e+00 + %78 = sext i1 %77 to i32 + %79 = bitcast i32 %78 to float + %80 = bitcast float %79 to i32 + %81 = icmp ne i32 %80, 0 + br i1 %81, label %IF137, label %ENDIF136 + +IF137: ; preds = %main_body + %82 = insertelement <4 x float> undef, float %30, i32 0 + %83 = insertelement <4 x float> %82, float %31, i32 1 + %84 = insertelement <4 x float> %83, float %32, i32 2 + %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 + %86 = insertelement <4 x float> undef, float %30, i32 0 + %87 = insertelement <4 x float> %86, float %31, i32 1 + %88 = insertelement <4 x float> %87, float %32, i32 2 + %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 + %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) + %91 = call float @llvm.AMDGPU.rsq.f32(float %90) + %92 = fmul float %30, %91 + %93 = fmul float %31, %91 + %94 = fmul float %32, %91 + %95 = insertelement <4 x float> undef, float %92, i32 0 + %96 = insertelement <4 x float> %95, float %93, i32 1 + %97 = insertelement <4 x float> %96, float %94, i32 2 + %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3 + %99 = insertelement <4 x float> undef, float %37, i32 0 + %100 = insertelement <4 x float> %99, float %38, i32 1 + %101 = insertelement <4 x float> %100, float %39, i32 2 + %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3 + %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102) + %104 = insertelement <4 x float> undef, float %92, i32 0 + %105 = insertelement <4 x float> %104, float %93, i32 1 + %106 = insertelement <4 x float> %105, float %94, i32 2 + %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3 + %108 = insertelement <4 x float> undef, float %40, i32 0 + %109 = insertelement <4 x float> %108, float %41, i32 1 + %110 = insertelement <4 x float> %109, float %42, i32 2 + %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3 + %112 = call 
float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111) + %113 = fsub float -0.000000e+00, %92 + %114 = fsub float -0.000000e+00, %93 + %115 = fsub float -0.000000e+00, %94 + %116 = insertelement <4 x float> undef, float %34, i32 0 + %117 = insertelement <4 x float> %116, float %35, i32 1 + %118 = insertelement <4 x float> %117, float %36, i32 2 + %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3 + %120 = insertelement <4 x float> undef, float %113, i32 0 + %121 = insertelement <4 x float> %120, float %114, i32 1 + %122 = insertelement <4 x float> %121, float %115, i32 2 + %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3 + %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123) + %125 = fdiv float 1.000000e+00, %124 + %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %127 = extractelement <4 x float> %126, i32 0 + %128 = fmul float %127, %125 + %129 = fmul float %103, %128 + %130 = fmul float %112, %128 + %131 = bitcast float %. to i32 + %132 = sitofp i32 %131 to float + %133 = fdiv float 1.000000e+00, %132 + %134 = bitcast float %. to i32 + %135 = add i32 %134, -1 + %136 = bitcast i32 %135 to float + %137 = bitcast float %136 to i32 + br label %LOOP + +ENDIF136: ; preds = %main_body, %ENDIF154 + %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ] + %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ] + %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ] + %138 = fmul float %26, 0x3F847AE140000000 + %139 = fmul float %27, 0x3F847AE140000000 + %140 = fmul float %28, 0x3F847AE140000000 + %141 = insertelement <4 x float> undef, float %138, i32 0 + %142 = insertelement <4 x float> %141, float %139, i32 1 + %143 = insertelement <4 x float> %142, float %140, i32 2 + %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3 + %145 = extractelement <4 x float> %144, i32 0 + %146 = extractelement <4 x float> %144, i32 1 + %147 = extractelement <4 x float> %144, i32 2 + %148 = extractelement <4 x float> %144, i32 3 + %149 = insertelement <4 x float> undef, float %145, i32 0 + %150 = insertelement <4 x float> %149, float %146, i32 1 + %151 = insertelement <4 x float> %150, float %147, i32 2 + %152 = insertelement <4 x float> %151, float %148, i32 3 + %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3) + %154 = extractelement <4 x float> %153, i32 0 + %155 = extractelement <4 x float> %153, i32 1 + %156 = extractelement <4 x float> %153, i32 2 + %157 = extractelement <4 x float> %153, i32 3 + %158 = fmul float %26, 0x3F45A07B40000000 + %159 = fmul float %27, 0x3F45A07B40000000 + %160 = fmul float %28, 0x3F45A07B40000000 + %161 = insertelement <4 x float> undef, float %158, i32 0 + %162 = insertelement <4 x float> %161, float %159, i32 1 + %163 = insertelement <4 x float> %162, float %160, i32 2 + %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3 + %165 = extractelement <4 x float> %164, i32 0 + %166 = extractelement <4 x float> %164, i32 1 + %167 = extractelement <4 x float> %164, i32 2 + %168 = extractelement <4 x float> %164, i32 3 + %169 = insertelement <4 x float> undef, float %165, i32 0 + %170 = insertelement <4 x float> %169, float %166, i32 1 + %171 = insertelement <4 x float> %170, float %167, i32 2 + %172 = insertelement <4 x float> %171, float %168, i32 3 + %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3) + 
%174 = extractelement <4 x float> %173, i32 0 + %175 = extractelement <4 x float> %173, i32 1 + %176 = extractelement <4 x float> %173, i32 2 + %177 = extractelement <4 x float> %173, i32 3 + %178 = fmul float %176, 3.000000e+03 + %179 = fadd float %178, %28 + %180 = fdiv float 1.000000e+00, %33 + %181 = fmul float %32, %180 + %182 = call float @fabs(float %181) + %183 = fmul float %174, 0x3FD99999A0000000 + %184 = fadd float %183, 0x3FAEB851E0000000 + %185 = fmul float %175, 0x3FE3333340000000 + %186 = fadd float %185, %184 + %187 = fmul float %176, 2.000000e+00 + %188 = fadd float %187, %186 + %189 = fmul float %177, 4.000000e+00 + %190 = fadd float %189, %188 + %191 = fmul float %154, 0x3FB99999A0000000 + %192 = fadd float %191, %190 + %193 = fmul float %155, 0x3FD99999A0000000 + %194 = fadd float %193, %192 + %195 = fmul float %156, 0x3FE99999A0000000 + %196 = fadd float %195, %194 + %197 = fmul float %157, 0x4000CCCCC0000000 + %198 = fadd float %197, %196 + %199 = fmul float 0xBE5EFB4CC0000000, %182 + %200 = fmul float %199, %182 + %201 = call float @llvm.AMDIL.exp.(float %200) + %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000) + %203 = fadd float %202, 0x3FF4CCCCC0000000 + %204 = fmul float %203, 0x3FE1C71C80000000 + %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00) + %206 = fadd float %202, 0x3FF4CCCCC0000000 + %207 = fmul float %206, 0x3FE1C71C80000000 + %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00) + %209 = fadd float %202, 2.000000e+00 + %210 = fmul float %209, 0x3FD611A7A0000000 + %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00) + %212 = fmul float 2.000000e+00, %205 + %213 = fsub float -0.000000e+00, %212 + %214 = fadd float 3.000000e+00, %213 + %215 = fmul float %205, %214 + %216 = fmul float %205, %215 + %217 = fmul float 2.000000e+00, %208 + %218 = fsub float -0.000000e+00, %217 + %219 = fadd float 3.000000e+00, %218 + %220 = fmul float %208, %219 + %221 = fmul float %208, %220 + %222 = fmul float 2.000000e+00, %211 + %223 = fsub float -0.000000e+00, %222 + %224 = fadd float 3.000000e+00, %223 + %225 = fmul float %211, %224 + %226 = fmul float %211, %225 + %227 = fmul float %26, 0x3F368B5CC0000000 + %228 = fmul float %27, 0x3F368B5CC0000000 + %229 = insertelement <4 x float> undef, float %227, i32 0 + %230 = insertelement <4 x float> %229, float %228, i32 1 + %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2 + %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3 + %233 = extractelement <4 x float> %232, i32 0 + %234 = extractelement <4 x float> %232, i32 1 + %235 = insertelement <4 x float> undef, float %233, i32 0 + %236 = insertelement <4 x float> %235, float %234, i32 1 + %237 = insertelement <4 x float> %236, float undef, i32 2 + %238 = insertelement <4 x float> %237, float undef, i32 3 + %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2) + %240 = extractelement <4 x float> %239, i32 0 + %241 = insertelement <4 x float> undef, float %240, i32 0 + %242 = insertelement <4 x float> %241, float %228, i32 1 + %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2 + %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3 + %245 = extractelement <4 x float> %244, i32 0 + %246 = insertelement <4 x float> undef, float %245, i32 0 + %247 = insertelement <4 x float> %246, float undef, i32 1 + %248 = insertelement <4 x float> %247, float undef, i32 2 + 
%249 = insertelement <4 x float> %248, float undef, i32 3 + %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1) + %251 = extractelement <4 x float> %250, i32 0 + %252 = extractelement <4 x float> %250, i32 1 + %253 = extractelement <4 x float> %250, i32 2 + %254 = extractelement <4 x float> %250, i32 3 + %255 = fmul float %251, %216 + %256 = fmul float %252, %221 + %257 = fmul float %253, %226 + %258 = fmul float %254, 0.000000e+00 + %259 = fadd float %202, 0x3FF4CCCCC0000000 + %260 = fmul float %259, 0x3FE1C71C80000000 + %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00) + %262 = fadd float %202, 0x3FF4CCCCC0000000 + %263 = fmul float %262, 0x3FE1C71C80000000 + %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00) + %265 = fadd float %202, 2.000000e+00 + %266 = fmul float %265, 0x3FD611A7A0000000 + %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00) + %268 = fmul float 2.000000e+00, %261 + %269 = fsub float -0.000000e+00, %268 + %270 = fadd float 3.000000e+00, %269 + %271 = fmul float %261, %270 + %272 = fmul float %261, %271 + %273 = fmul float 2.000000e+00, %264 + %274 = fsub float -0.000000e+00, %273 + %275 = fadd float 3.000000e+00, %274 + %276 = fmul float %264, %275 + %277 = fmul float %264, %276 + %278 = fmul float 2.000000e+00, %267 + %279 = fsub float -0.000000e+00, %278 + %280 = fadd float 3.000000e+00, %279 + %281 = fmul float %267, %280 + %282 = fmul float %267, %281 + %283 = fmul float %26, 0x3F22DFD6A0000000 + %284 = fmul float %27, 0x3F22DFD6A0000000 + %285 = insertelement <4 x float> undef, float %283, i32 0 + %286 = insertelement <4 x float> %285, float %284, i32 1 + %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2 + %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3 + %289 = extractelement <4 x float> %288, i32 0 + %290 = extractelement <4 x float> %288, i32 1 + %291 = insertelement <4 x float> undef, float %289, i32 0 + %292 = insertelement <4 x float> %291, float %290, i32 1 + %293 = insertelement <4 x float> %292, float undef, i32 2 + %294 = insertelement <4 x float> %293, float undef, i32 3 + %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2) + %296 = extractelement <4 x float> %295, i32 0 + %297 = extractelement <4 x float> %295, i32 1 + %298 = extractelement <4 x float> %295, i32 2 + %299 = extractelement <4 x float> %295, i32 3 + %300 = fmul float %296, %272 + %301 = fmul float %297, %277 + %302 = fmul float %298, %282 + %303 = fmul float %299, 0.000000e+00 + %304 = fmul float %temp68.1, %37 + %305 = fmul float %temp68.1, %38 + %306 = fmul float %temp68.1, %39 + %307 = fmul float %temp69.0, %40 + %308 = fadd float %307, %304 + %309 = fmul float %temp69.0, %41 + %310 = fadd float %309, %305 + %311 = fmul float %temp69.0, %42 + %312 = fadd float %311, %306 + %313 = fmul float %temp70.0, %34 + %314 = fadd float %313, %308 + %315 = fmul float %temp70.0, %35 + %316 = fadd float %315, %310 + %317 = fmul float %temp70.0, %36 + %318 = fadd float %317, %312 + %319 = insertelement <4 x float> undef, float %314, i32 0 + %320 = insertelement <4 x float> %319, float %316, i32 1 + %321 = insertelement <4 x float> %320, float %318, i32 2 + %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3 + %323 = insertelement <4 x float> undef, float %314, i32 0 + %324 = insertelement <4 x float> %323, float %316, i32 1 + %325 = insertelement <4 x float> %324, float %318, i32 2 + 
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 + %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) + %328 = call float @llvm.AMDGPU.rsq.f32(float %327) + %329 = fmul float %314, %328 + %330 = fmul float %316, %328 + %331 = fmul float %318, %328 + %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %333 = extractelement <4 x float> %332, i32 0 + %334 = fsub float -0.000000e+00, %333 + %335 = fadd float 1.000000e+00, %334 + %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %337 = extractelement <4 x float> %336, i32 0 + %338 = fsub float -0.000000e+00, %337 + %339 = fadd float 1.000000e+00, %338 + %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %341 = extractelement <4 x float> %340, i32 0 + %342 = fsub float -0.000000e+00, %341 + %343 = fadd float 1.000000e+00, %342 + %344 = fsub float -0.000000e+00, %335 + %345 = fadd float %202, %344 + %346 = fsub float -0.000000e+00, %339 + %347 = fadd float %202, %346 + %348 = fadd float %347, 0xBFE3333340000000 + %349 = fsub float -0.000000e+00, %202 + %350 = fsub float -0.000000e+00, %343 + %351 = fadd float %349, %350 + %352 = insertelement <4 x float> undef, float %43, i32 0 + %353 = insertelement <4 x float> %352, float %44, i32 1 + %354 = insertelement <4 x float> %353, float %45, i32 2 + %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3 + %356 = insertelement <4 x float> undef, float %43, i32 0 + %357 = insertelement <4 x float> %356, float %44, i32 1 + %358 = insertelement <4 x float> %357, float %45, i32 2 + %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 + %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) + %361 = call float @llvm.AMDGPU.rsq.f32(float %360) + %362 = fmul float %45, %361 + %363 = call float @fabs(float %362) + %364 = fmul float %176, 0x3FECCCCCC0000000 + %365 = fadd float %364, %363 + %366 = fadd float %365, 0xBFEFAE1480000000 + %367 = fmul float %366, 0xC023FFFFC0000000 + %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00) + %369 = fsub float -0.000000e+00, %335 + %370 = fadd float %202, %369 + %371 = fadd float %370, 0x3FBEB851E0000000 + %372 = fsub float -0.000000e+00, %339 + %373 = fadd float %202, %372 + %374 = fadd float %373, 0xBFE0A3D700000000 + %375 = fsub float -0.000000e+00, %202 + %376 = fsub float -0.000000e+00, %343 + %377 = fadd float %375, %376 + %378 = insertelement <4 x float> undef, float %43, i32 0 + %379 = insertelement <4 x float> %378, float %44, i32 1 + %380 = insertelement <4 x float> %379, float %45, i32 2 + %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3 + %382 = insertelement <4 x float> undef, float %43, i32 0 + %383 = insertelement <4 x float> %382, float %44, i32 1 + %384 = insertelement <4 x float> %383, float %45, i32 2 + %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 + %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) + %387 = call float @llvm.AMDGPU.rsq.f32(float %386) + %388 = fmul float %45, %387 + %389 = call float @fabs(float %388) + %390 = fmul float %176, 0x3FF51EB860000000 + %391 = fadd float %390, %389 + %392 = fadd float %391, 0xBFEFAE1480000000 + %393 = fmul float %392, 0xC0490001A0000000 + %394 = call float 
@llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00) + %395 = fmul float 2.000000e+00, %368 + %396 = fsub float -0.000000e+00, %395 + %397 = fadd float 3.000000e+00, %396 + %398 = fmul float %368, %397 + %399 = fmul float %368, %398 + %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345) + %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348) + %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351) + %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00) + %404 = fmul float 2.000000e+00, %394 + %405 = fsub float -0.000000e+00, %404 + %406 = fadd float 3.000000e+00, %405 + %407 = fmul float %394, %406 + %408 = fmul float %394, %407 + %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371) + %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374) + %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377) + %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000) + %413 = fcmp oge float 2.200000e+03, %179 + %414 = sext i1 %413 to i32 + %415 = bitcast i32 %414 to float + %416 = bitcast float %415 to i32 + %417 = icmp ne i32 %416, 0 + br i1 %417, label %IF161, label %ENDIF160 + +LOOP: ; preds = %ENDIF139, %IF137 + %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ] + %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ] + %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ] + %418 = bitcast float %temp96.0 to i32 + %419 = icmp sge i32 %418, %137 + %420 = sext i1 %419 to i32 + %421 = bitcast i32 %420 to float + %422 = bitcast float %421 to i32 + %423 = icmp ne i32 %422, 0 + br i1 %423, label %IF140, label %ENDIF139 + +IF140: ; preds = %LOOP + %424 = fmul float %133, 5.000000e-01 + %425 = fmul float %129, %temp92.0 + %426 = fadd float %425, %22 + %427 = fmul float %130, %temp92.0 + %428 = fadd float %427, %23 + %429 = insertelement <4 x float> undef, float %426, i32 0 + %430 = insertelement <4 x float> %429, float %428, i32 1 + %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2 + %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3 + %433 = extractelement <4 x float> %432, i32 0 + %434 = extractelement <4 x float> %432, i32 1 + %435 = insertelement <4 x float> undef, float %433, i32 0 + %436 = insertelement <4 x float> %435, float %434, i32 1 + %437 = insertelement <4 x float> %436, float undef, i32 2 + %438 = insertelement <4 x float> %437, float undef, i32 3 + %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2) + %440 = extractelement <4 x float> %439, i32 3 + %441 = fcmp oge float %temp92.0, %440 + %442 = sext i1 %441 to i32 + %443 = bitcast i32 %442 to float + %444 = bitcast float %443 to i32 + %445 = icmp ne i32 %444, 0 + br i1 %445, label %IF146, label %ENDIF145 + +ENDIF139: ; preds = %LOOP + %446 = fadd float %temp88.0, %133 + %447 = fmul float %129, %446 + %448 = fadd float %447, %22 + %449 = fmul float %130, %446 + %450 = fadd float %449, %23 + %451 = insertelement <4 x float> undef, float %448, i32 0 + %452 = insertelement <4 x float> %451, float %450, i32 1 + %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2 + %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3 + %455 = extractelement <4 x float> %454, i32 0 + %456 = extractelement <4 x float> %454, i32 1 + %457 = insertelement <4 x float> undef, float %455, i32 0 + %458 = insertelement <4 x float> %457, float %456, i32 1 + %459 = 
insertelement <4 x float> %458, float undef, i32 2 + %460 = insertelement <4 x float> %459, float undef, i32 3 + %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2) + %462 = extractelement <4 x float> %461, i32 3 + %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0 + %464 = sext i1 %463 to i32 + %465 = bitcast i32 %464 to float + %466 = fcmp oge float %446, %462 + %467 = sext i1 %466 to i32 + %468 = bitcast i32 %467 to float + %469 = bitcast float %465 to i32 + %470 = bitcast float %468 to i32 + %471 = and i32 %469, %470 + %472 = bitcast i32 %471 to float + %473 = bitcast float %472 to i32 + %474 = icmp ne i32 %473, 0 + %.temp92.0 = select i1 %474, float %446, float %temp92.0 + %475 = bitcast float %temp96.0 to i32 + %476 = add i32 %475, 1 + %477 = bitcast i32 %476 to float + br label %LOOP + +IF146: ; preds = %IF140 + %478 = fmul float 2.000000e+00, %424 + %479 = fsub float -0.000000e+00, %478 + %480 = fadd float %temp92.0, %479 + br label %ENDIF145 + +ENDIF145: ; preds = %IF140, %IF146 + %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ] + %481 = fadd float %temp88.1, %424 + %482 = fmul float %424, 5.000000e-01 + %483 = fmul float %129, %481 + %484 = fadd float %483, %22 + %485 = fmul float %130, %481 + %486 = fadd float %485, %23 + %487 = insertelement <4 x float> undef, float %484, i32 0 + %488 = insertelement <4 x float> %487, float %486, i32 1 + %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2 + %490 = insertelement <4 x float> %489, float %440, i32 3 + %491 = extractelement <4 x float> %490, i32 0 + %492 = extractelement <4 x float> %490, i32 1 + %493 = insertelement <4 x float> undef, float %491, i32 0 + %494 = insertelement <4 x float> %493, float %492, i32 1 + %495 = insertelement <4 x float> %494, float undef, i32 2 + %496 = insertelement <4 x float> %495, float undef, i32 3 + %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2) + %498 = extractelement <4 x float> %497, i32 3 + %499 = fcmp oge float %481, %498 + %500 = sext i1 %499 to i32 + %501 = bitcast i32 %500 to float + %502 = bitcast float %501 to i32 + %503 = icmp ne i32 %502, 0 + br i1 %503, label %IF149, label %ENDIF148 + +IF149: ; preds = %ENDIF145 + %504 = fmul float 2.000000e+00, %482 + %505 = fsub float -0.000000e+00, %504 + %506 = fadd float %481, %505 + br label %ENDIF148 + +ENDIF148: ; preds = %ENDIF145, %IF149 + %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ] + %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ] + %507 = fadd float %temp88.2, %482 + %508 = fmul float %482, 5.000000e-01 + %509 = fmul float %129, %507 + %510 = fadd float %509, %22 + %511 = fmul float %130, %507 + %512 = fadd float %511, %23 + %513 = insertelement <4 x float> undef, float %510, i32 0 + %514 = insertelement <4 x float> %513, float %512, i32 1 + %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2 + %516 = insertelement <4 x float> %515, float %498, i32 3 + %517 = extractelement <4 x float> %516, i32 0 + %518 = extractelement <4 x float> %516, i32 1 + %519 = insertelement <4 x float> undef, float %517, i32 0 + %520 = insertelement <4 x float> %519, float %518, i32 1 + %521 = insertelement <4 x float> %520, float undef, i32 2 + %522 = insertelement <4 x float> %521, float undef, i32 3 + %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2) + %524 = extractelement <4 x float> %523, i32 3 + %525 = fcmp oge float %507, %524 + %526 = sext i1 %525 to i32 + %527 = bitcast i32 %526 to float + 
%528 = bitcast float %527 to i32 + %529 = icmp ne i32 %528, 0 + br i1 %529, label %IF152, label %ENDIF151 + +IF152: ; preds = %ENDIF148 + %530 = fmul float 2.000000e+00, %508 + %531 = fsub float -0.000000e+00, %530 + %532 = fadd float %507, %531 + br label %ENDIF151 + +ENDIF151: ; preds = %ENDIF148, %IF152 + %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ] + %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ] + %533 = fadd float %temp88.3, %508 + %534 = fmul float %508, 5.000000e-01 + %535 = fmul float %129, %533 + %536 = fadd float %535, %22 + %537 = fmul float %130, %533 + %538 = fadd float %537, %23 + %539 = insertelement <4 x float> undef, float %536, i32 0 + %540 = insertelement <4 x float> %539, float %538, i32 1 + %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2 + %542 = insertelement <4 x float> %541, float %524, i32 3 + %543 = extractelement <4 x float> %542, i32 0 + %544 = extractelement <4 x float> %542, i32 1 + %545 = insertelement <4 x float> undef, float %543, i32 0 + %546 = insertelement <4 x float> %545, float %544, i32 1 + %547 = insertelement <4 x float> %546, float undef, i32 2 + %548 = insertelement <4 x float> %547, float undef, i32 3 + %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2) + %550 = extractelement <4 x float> %549, i32 3 + %551 = fcmp oge float %533, %550 + %552 = sext i1 %551 to i32 + %553 = bitcast i32 %552 to float + %554 = bitcast float %553 to i32 + %555 = icmp ne i32 %554, 0 + br i1 %555, label %IF155, label %ENDIF154 + +IF155: ; preds = %ENDIF151 + %556 = fmul float 2.000000e+00, %534 + %557 = fsub float -0.000000e+00, %556 + %558 = fadd float %533, %557 + br label %ENDIF154 + +ENDIF154: ; preds = %ENDIF151, %IF155 + %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ] + %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ] + %559 = fadd float %temp88.4, %534 + %560 = fmul float %129, %559 + %561 = fadd float %560, %22 + %562 = fmul float %130, %559 + %563 = fadd float %562, %23 + %564 = insertelement <4 x float> undef, float %561, i32 0 + %565 = insertelement <4 x float> %564, float %563, i32 1 + %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2 + %567 = insertelement <4 x float> %566, float %550, i32 3 + %568 = extractelement <4 x float> %567, i32 0 + %569 = extractelement <4 x float> %567, i32 1 + %570 = insertelement <4 x float> undef, float %568, i32 0 + %571 = insertelement <4 x float> %570, float %569, i32 1 + %572 = insertelement <4 x float> %571, float undef, i32 2 + %573 = insertelement <4 x float> %572, float undef, i32 3 + %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2) + %575 = extractelement <4 x float> %574, i32 3 + %576 = fcmp oge float %559, %575 + %577 = sext i1 %576 to i32 + %578 = bitcast i32 %577 to float + %579 = bitcast float %578 to i32 + %580 = icmp ne i32 %579, 0 + %.temp92.4 = select i1 %580, float %559, float %temp92.4 + %581 = fmul float %129, %.temp92.4 + %582 = fadd float %581, %22 + %583 = fmul float %130, %.temp92.4 + %584 = fadd float %583, %23 + %585 = insertelement <4 x float> undef, float %582, i32 0 + %586 = insertelement <4 x float> %585, float %584, i32 1 + %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2 + %588 = insertelement <4 x float> %587, float %575, i32 3 + %589 = extractelement <4 x float> %588, i32 0 + %590 = extractelement <4 x float> %588, i32 1 + %591 = insertelement <4 x float> undef, float %589, i32 0 + %592 = insertelement <4 x float> %591, 
float %590, i32 1 + %593 = insertelement <4 x float> %592, float undef, i32 2 + %594 = insertelement <4 x float> %593, float undef, i32 3 + %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2) + %596 = extractelement <4 x float> %595, i32 0 + %597 = extractelement <4 x float> %595, i32 1 + %598 = extractelement <4 x float> %595, i32 2 + %599 = fmul float %596, 2.000000e+00 + %600 = fadd float %599, -1.000000e+00 + %601 = fmul float %597, 2.000000e+00 + %602 = fadd float %601, -1.000000e+00 + %603 = fmul float %598, 2.000000e+00 + %604 = fadd float %603, -1.000000e+00 + br label %ENDIF136 + +IF161: ; preds = %ENDIF136 + %605 = fmul float %202, 0x3FB99999A0000000 + %606 = fcmp uge float 0x3FE4CCCCC0000000, %605 + %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605 + %608 = fcmp uge float %607, 5.000000e-01 + %609 = select i1 %608, float 5.000000e-01, float %607 + %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300) + %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301) + %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302) + %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303) + %614 = insertelement <4 x float> undef, float %329, i32 0 + %615 = insertelement <4 x float> %614, float %330, i32 1 + %616 = insertelement <4 x float> %615, float %331, i32 2 + %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3 + %618 = insertelement <4 x float> undef, float %63, i32 0 + %619 = insertelement <4 x float> %618, float %65, i32 1 + %620 = insertelement <4 x float> %619, float %67, i32 2 + %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3 + %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621) + %623 = fcmp uge float 0x3FE6666660000000, %622 + %624 = select i1 %623, float 0x3FE6666660000000, float %622 + %625 = fmul float %8, %624 + %626 = fmul float %13, %624 + %627 = fmul float %18, %624 + %628 = insertelement <4 x float> undef, float %34, i32 0 + %629 = insertelement <4 x float> %628, float %35, i32 1 + %630 = insertelement <4 x float> %629, float %36, i32 2 + %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3 + %632 = insertelement <4 x float> undef, float %63, i32 0 + %633 = insertelement <4 x float> %632, float %65, i32 1 + %634 = insertelement <4 x float> %633, float %67, i32 2 + %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3 + %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635) + %637 = fcmp uge float 0x3FECCCCCC0000000, %636 + %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636 + %639 = fmul float %625, %638 + %640 = fmul float %626, %638 + %641 = fmul float %627, %638 + br label %ENDIF160 + +ENDIF160: ; preds = %ENDIF136, %IF161 + %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ] + %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ] + %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ] + %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ] + %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ] + %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ] + %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ] + %642 = fcmp olt float 2.200000e+03, %179 + %643 = sext i1 %642 to i32 + %644 = bitcast i32 %643 to float + %645 = fcmp olt float %179, 2.300000e+03 + %646 = sext i1 %645 to i32 + %647 = bitcast i32 %646 to float + %648 = bitcast float %644 to i32 + %649 = bitcast float %647 to i32 + %650 = and i32 %648, %649 + %651 = 
bitcast i32 %650 to float + %652 = bitcast float %651 to i32 + %653 = icmp ne i32 %652, 0 + br i1 %653, label %IF164, label %ENDIF163 + +IF164: ; preds = %ENDIF160 + %654 = fmul float %202, 5.000000e-01 + %655 = fcmp uge float 0x3FE4CCCCC0000000, %654 + %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654 + %657 = fcmp uge float %656, 0x3FD6666660000000 + %658 = select i1 %657, float 0x3FD6666660000000, float %656 + %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300) + %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301) + %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302) + %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303) + %663 = insertelement <4 x float> undef, float %329, i32 0 + %664 = insertelement <4 x float> %663, float %330, i32 1 + %665 = insertelement <4 x float> %664, float %331, i32 2 + %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3 + %667 = insertelement <4 x float> undef, float %63, i32 0 + %668 = insertelement <4 x float> %667, float %65, i32 1 + %669 = insertelement <4 x float> %668, float %67, i32 2 + %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3 + %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670) + %672 = fcmp uge float 0x3FE6666660000000, %671 + %673 = select i1 %672, float 0x3FE6666660000000, float %671 + %674 = fmul float %8, %673 + %675 = fmul float %13, %673 + %676 = fmul float %18, %673 + %677 = insertelement <4 x float> undef, float %34, i32 0 + %678 = insertelement <4 x float> %677, float %35, i32 1 + %679 = insertelement <4 x float> %678, float %36, i32 2 + %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3 + %681 = insertelement <4 x float> undef, float %63, i32 0 + %682 = insertelement <4 x float> %681, float %65, i32 1 + %683 = insertelement <4 x float> %682, float %67, i32 2 + %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3 + %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684) + %686 = fcmp uge float 0x3FECCCCCC0000000, %685 + %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685 + %688 = fmul float %674, %687 + %689 = fmul float %675, %687 + %690 = fmul float %676, %687 + br label %ENDIF163 + +ENDIF163: ; preds = %ENDIF160, %IF164 + %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ] + %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ] + %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ] + %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ] + %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ] + %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ] + %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ] + %691 = fcmp oge float %179, 2.300000e+03 + %692 = sext i1 %691 to i32 + %693 = bitcast i32 %692 to float + %694 = fcmp olt float %179, 2.480000e+03 + %695 = sext i1 %694 to i32 + %696 = bitcast i32 %695 to float + %697 = bitcast float %693 to i32 + %698 = bitcast float %696 to i32 + %699 = and i32 %697, %698 + %700 = bitcast i32 %699 to float + %701 = bitcast float %700 to i32 + %702 = icmp ne i32 %701, 0 + br i1 %702, label %IF167, label %ENDIF166 + +IF167: ; preds = %ENDIF163 + %703 = fmul float %202, 5.000000e-01 + %704 = fcmp uge float 0x3FE4CCCCC0000000, %703 + %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703 + %706 = fcmp uge float %705, 0x3FD3333340000000 + %707 = select i1 %706, float 0x3FD3333340000000, float %705 + %708 = call float 
@llvm.AMDGPU.lrp(float %707, float %409, float %300) + %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301) + %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302) + %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303) + %712 = insertelement <4 x float> undef, float %329, i32 0 + %713 = insertelement <4 x float> %712, float %330, i32 1 + %714 = insertelement <4 x float> %713, float %331, i32 2 + %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3 + %716 = insertelement <4 x float> undef, float %63, i32 0 + %717 = insertelement <4 x float> %716, float %65, i32 1 + %718 = insertelement <4 x float> %717, float %67, i32 2 + %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3 + %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719) + %721 = fcmp uge float 0x3FEB333340000000, %720 + %722 = select i1 %721, float 0x3FEB333340000000, float %720 + %723 = fmul float %8, %722 + %724 = fmul float %13, %722 + %725 = fmul float %18, %722 + %726 = insertelement <4 x float> undef, float %34, i32 0 + %727 = insertelement <4 x float> %726, float %35, i32 1 + %728 = insertelement <4 x float> %727, float %36, i32 2 + %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3 + %730 = insertelement <4 x float> undef, float %63, i32 0 + %731 = insertelement <4 x float> %730, float %65, i32 1 + %732 = insertelement <4 x float> %731, float %67, i32 2 + %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3 + %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733) + %735 = fcmp uge float 0x3FECCCCCC0000000, %734 + %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734 + %737 = fmul float %723, %736 + %738 = fmul float %724, %736 + %739 = fmul float %725, %736 + br label %ENDIF166 + +ENDIF166: ; preds = %ENDIF163, %IF167 + %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ] + %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ] + %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ] + %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ] + %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ] + %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ] + %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ] + %740 = fcmp oge float %179, 2.480000e+03 + %741 = sext i1 %740 to i32 + %742 = bitcast i32 %741 to float + %743 = fcmp olt float %179, 2.530000e+03 + %744 = sext i1 %743 to i32 + %745 = bitcast i32 %744 to float + %746 = bitcast float %742 to i32 + %747 = bitcast float %745 to i32 + %748 = and i32 %746, %747 + %749 = bitcast i32 %748 to float + %750 = bitcast float %749 to i32 + %751 = icmp ne i32 %750, 0 + br i1 %751, label %IF170, label %ENDIF169 + +IF170: ; preds = %ENDIF166 + %752 = fmul float %202, 5.000000e-01 + %753 = fcmp uge float 0x3FE4CCCCC0000000, %752 + %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752 + %755 = fcmp uge float %754, 0x3FC99999A0000000 + %756 = select i1 %755, float 0x3FC99999A0000000, float %754 + %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300) + %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301) + %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302) + %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303) + %761 = insertelement <4 x float> undef, float %329, i32 0 + %762 = insertelement <4 x float> %761, float %330, i32 1 + %763 = insertelement <4 x float> %762, float %331, i32 2 + %764 = 
insertelement <4 x float> %763, float 0.000000e+00, i32 3 + %765 = insertelement <4 x float> undef, float %63, i32 0 + %766 = insertelement <4 x float> %765, float %65, i32 1 + %767 = insertelement <4 x float> %766, float %67, i32 2 + %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3 + %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768) + %770 = fcmp uge float 0x3FEB333340000000, %769 + %771 = select i1 %770, float 0x3FEB333340000000, float %769 + %772 = fmul float %8, %771 + %773 = fmul float %13, %771 + %774 = fmul float %18, %771 + %775 = insertelement <4 x float> undef, float %34, i32 0 + %776 = insertelement <4 x float> %775, float %35, i32 1 + %777 = insertelement <4 x float> %776, float %36, i32 2 + %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3 + %779 = insertelement <4 x float> undef, float %63, i32 0 + %780 = insertelement <4 x float> %779, float %65, i32 1 + %781 = insertelement <4 x float> %780, float %67, i32 2 + %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3 + %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782) + %784 = fcmp uge float 0x3FECCCCCC0000000, %783 + %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783 + %786 = fmul float %772, %785 + %787 = fmul float %773, %785 + %788 = fmul float %774, %785 + br label %ENDIF169 + +ENDIF169: ; preds = %ENDIF166, %IF170 + %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ] + %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ] + %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ] + %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ] + %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ] + %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ] + %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ] + %789 = fcmp oge float %179, 2.530000e+03 + %790 = sext i1 %789 to i32 + %791 = bitcast i32 %790 to float + %792 = fcmp olt float %179, 2.670000e+03 + %793 = sext i1 %792 to i32 + %794 = bitcast i32 %793 to float + %795 = bitcast float %791 to i32 + %796 = bitcast float %794 to i32 + %797 = and i32 %795, %796 + %798 = bitcast i32 %797 to float + %799 = bitcast float %798 to i32 + %800 = icmp ne i32 %799, 0 + br i1 %800, label %IF173, label %ENDIF172 + +IF173: ; preds = %ENDIF169 + %801 = fmul float %202, 5.000000e-01 + %802 = fcmp uge float 0x3FE4CCCCC0000000, %801 + %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801 + %804 = fcmp uge float %803, 0x3FB99999A0000000 + %805 = select i1 %804, float 0x3FB99999A0000000, float %803 + %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300) + %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301) + %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302) + %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303) + %810 = insertelement <4 x float> undef, float %329, i32 0 + %811 = insertelement <4 x float> %810, float %330, i32 1 + %812 = insertelement <4 x float> %811, float %331, i32 2 + %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3 + %814 = insertelement <4 x float> undef, float %63, i32 0 + %815 = insertelement <4 x float> %814, float %65, i32 1 + %816 = insertelement <4 x float> %815, float %67, i32 2 + %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3 + %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817) + %819 = fcmp uge float 0x3FEB333340000000, %818 + %820 = select i1 %819, float 
0x3FEB333340000000, float %818 + %821 = fmul float %8, %820 + %822 = fmul float %13, %820 + %823 = fmul float %18, %820 + %824 = insertelement <4 x float> undef, float %34, i32 0 + %825 = insertelement <4 x float> %824, float %35, i32 1 + %826 = insertelement <4 x float> %825, float %36, i32 2 + %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3 + %828 = insertelement <4 x float> undef, float %63, i32 0 + %829 = insertelement <4 x float> %828, float %65, i32 1 + %830 = insertelement <4 x float> %829, float %67, i32 2 + %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3 + %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831) + %833 = fcmp uge float 0x3FECCCCCC0000000, %832 + %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832 + %835 = fmul float %821, %834 + %836 = fmul float %822, %834 + %837 = fmul float %823, %834 + br label %ENDIF172 + +ENDIF172: ; preds = %ENDIF169, %IF173 + %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ] + %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ] + %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ] + %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ] + %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ] + %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ] + %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ] + %838 = fcmp oge float %179, 2.670000e+03 + %839 = sext i1 %838 to i32 + %840 = bitcast i32 %839 to float + %841 = bitcast float %840 to i32 + %842 = icmp ne i32 %841, 0 + br i1 %842, label %IF176, label %ENDIF175 + +IF176: ; preds = %ENDIF172 + %843 = fmul float %202, 0x3FB99999A0000000 + %844 = fcmp uge float 0.000000e+00, %843 + %845 = select i1 %844, float 0.000000e+00, float %843 + %846 = fcmp uge float %845, 0x3FD99999A0000000 + %847 = select i1 %846, float 0x3FD99999A0000000, float %845 + %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300) + %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301) + %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302) + %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303) + %852 = insertelement <4 x float> undef, float %329, i32 0 + %853 = insertelement <4 x float> %852, float %330, i32 1 + %854 = insertelement <4 x float> %853, float %331, i32 2 + %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3 + %856 = insertelement <4 x float> undef, float %63, i32 0 + %857 = insertelement <4 x float> %856, float %65, i32 1 + %858 = insertelement <4 x float> %857, float %67, i32 2 + %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3 + %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859) + %861 = fcmp uge float 0x3FEB333340000000, %860 + %862 = select i1 %861, float 0x3FEB333340000000, float %860 + %863 = fmul float %8, %862 + %864 = fmul float %13, %862 + %865 = fmul float %18, %862 + %866 = insertelement <4 x float> undef, float %34, i32 0 + %867 = insertelement <4 x float> %866, float %35, i32 1 + %868 = insertelement <4 x float> %867, float %36, i32 2 + %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3 + %870 = insertelement <4 x float> undef, float %63, i32 0 + %871 = insertelement <4 x float> %870, float %65, i32 1 + %872 = insertelement <4 x float> %871, float %67, i32 2 + %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3 + %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873) + %875 = fcmp uge float 
0x3FECCCCCC0000000, %874 + %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874 + %877 = fmul float %863, %876 + %878 = fmul float %864, %876 + %879 = fmul float %865, %876 + br label %ENDIF175 + +ENDIF175: ; preds = %ENDIF172, %IF176 + %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ] + %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ] + %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ] + %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ] + %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ] + %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ] + %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ] + %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %881 = extractelement <4 x float> %880, i32 0 + %882 = fcmp olt float %881, %179 + %883 = sext i1 %882 to i32 + %884 = bitcast i32 %883 to float + %885 = bitcast float %884 to i32 + %886 = icmp ne i32 %885, 0 + br i1 %886, label %IF179, label %ENDIF178 + +IF179: ; preds = %ENDIF175 + %887 = fadd float %202, 1.000000e+00 + %888 = fadd float %202, 1.000000e+00 + %889 = fadd float %202, 1.000000e+00 + %890 = insertelement <4 x float> undef, float %43, i32 0 + %891 = insertelement <4 x float> %890, float %44, i32 1 + %892 = insertelement <4 x float> %891, float %45, i32 2 + %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3 + %894 = insertelement <4 x float> undef, float %43, i32 0 + %895 = insertelement <4 x float> %894, float %44, i32 1 + %896 = insertelement <4 x float> %895, float %45, i32 2 + %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 + %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) + %899 = call float @llvm.AMDGPU.rsq.f32(float %898) + %900 = fmul float %45, %899 + %901 = call float @fabs(float %900) + %902 = fmul float %176, 0x3FECCCCCC0000000 + %903 = fadd float %902, %901 + %904 = fadd float %903, 0xBFEFAE1480000000 + %905 = fmul float %904, 0xC043FFFE20000000 + %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00) + %907 = fmul float 2.000000e+00, %906 + %908 = fsub float -0.000000e+00, %907 + %909 = fadd float 3.000000e+00, %908 + %910 = fmul float %906, %909 + %911 = fmul float %906, %910 + %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887) + %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888) + %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889) + %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00) + %916 = fmul float %202, 5.000000e-01 + %917 = fcmp uge float 0x3FE4CCCCC0000000, %916 + %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916 + %919 = fcmp uge float %918, 0x3FE3333340000000 + %920 = select i1 %919, float 0x3FE3333340000000, float %918 + %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5) + %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5) + %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5) + %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5) + %925 = insertelement <4 x float> undef, float %329, i32 0 + %926 = insertelement <4 x float> %925, float %330, i32 1 + %927 = insertelement <4 x float> %926, float %331, i32 2 + %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3 + %929 = insertelement <4 x float> 
undef, float %63, i32 0 + %930 = insertelement <4 x float> %929, float %65, i32 1 + %931 = insertelement <4 x float> %930, float %67, i32 2 + %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3 + %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932) + %934 = fcmp uge float 0x3FE99999A0000000, %933 + %935 = select i1 %934, float 0x3FE99999A0000000, float %933 + %936 = fmul float %8, %935 + %937 = fmul float %13, %935 + %938 = fmul float %18, %935 + %939 = insertelement <4 x float> undef, float %34, i32 0 + %940 = insertelement <4 x float> %939, float %35, i32 1 + %941 = insertelement <4 x float> %940, float %36, i32 2 + %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3 + %943 = insertelement <4 x float> undef, float %63, i32 0 + %944 = insertelement <4 x float> %943, float %65, i32 1 + %945 = insertelement <4 x float> %944, float %67, i32 2 + %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3 + %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946) + %948 = fcmp uge float 0x3FECCCCCC0000000, %947 + %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947 + %950 = fmul float %936, %949 + %951 = fmul float %937, %949 + %952 = fmul float %938, %949 + br label %ENDIF178 + +ENDIF178: ; preds = %ENDIF175, %IF179 + %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ] + %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ] + %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ] + %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ] + %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ] + %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ] + %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ] + %953 = fmul float %55, %temp92.12 + %954 = fmul float %57, %temp93.6 + %955 = fmul float %59, %temp94.6 + %956 = fmul float %61, 0.000000e+00 + %957 = fmul float %temp84.6, %953 + %958 = fmul float %temp85.6, %954 + %959 = fmul float %temp86.6, %955 + %960 = fmul float %temp87.6, %956 + %961 = fmul float %2, -2.000000e+00 + %962 = fadd float %961, 1.000000e+00 + %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23) + %964 = extractelement <4 x float> %963, i32 2 + %965 = fsub float -0.000000e+00, %964 + %966 = fadd float %962, %965 + %967 = fdiv float 1.000000e+00, %966 + %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24) + %969 = extractelement <4 x float> %968, i32 2 + %970 = fmul float %969, %967 + %971 = fsub float -0.000000e+00, %53 + %972 = fmul float %971, %53 + %973 = fmul float %972, %970 + %974 = fmul float %973, %970 + %975 = fmul float %974, 0x3FF7154760000000 + %976 = call float @llvm.AMDIL.exp.(float %975) + %977 = fcmp oeq float %53, 1.000000e+00 + %978 = sext i1 %977 to i32 + %979 = bitcast i32 %978 to float + %980 = bitcast float %979 to i32 + %981 = icmp ne i32 %980, 0 + %.184 = select i1 %981, float 1.000000e+00, float %976 + %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47) + %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49) + %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51) + %985 = insertelement <4 x float> undef, float %982, i32 0 + %986 = insertelement <4 x float> %985, float %983, i32 1 + %987 = insertelement <4 x float> %986, float %984, i32 2 + %988 = insertelement <4 x float> 
%987, float %960, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } +attributes #2 = { readonly } diff --git a/test/CodeGen/AMDGPU/bitcast.ll b/test/CodeGen/AMDGPU/bitcast.ll new file mode 100644 index 00000000000..fd56d956bf3 --- /dev/null +++ b/test/CodeGen/AMDGPU/bitcast.ll @@ -0,0 +1,79 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; This test just checks that the compiler doesn't crash. + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +; FUNC-LABEL: {{^}}v32i8_to_v8i32: +; SI: s_endpgm +define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +entry: + %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 + %2 = bitcast <32 x i8> %1 to <8 x i32> + %3 = extractelement <8 x i32> %2, i32 1 + %4 = icmp ne i32 %3, 0 + %5 = select i1 %4, float 0.0, float 1.0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) + ret void +} + +; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: +; SI: s_endpgm +define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* + %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 + store <16 x i8> %1, <16 x i8> addrspace(1)* %out + ret void +} + +define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %load = load float, float addrspace(1)* %in, align 4 + %bc = bitcast float %load to <2 x i16> + store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 + %bc = bitcast <2 x i16> %load to float + store float %bc, float addrspace(1)* %out, align 4 + ret void +} + +define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %bc = bitcast <4 x i8> %load to i32 + store i32 %bc, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %bc = bitcast i32 %load to <4 x i8> + store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: +; SI: s_endpgm +define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %add = add <2 x i32> %val, + %bc = bitcast <2 x i32> %add to double + store double %bc, double addrspace(1)* %out, 
align 8 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: +; SI: s_endpgm +define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { + %val = load double, double addrspace(1)* %in, align 8 + %add = fadd double %val, 4.0 + %bc = bitcast double %add to <2 x i32> + store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll new file mode 100644 index 00000000000..4cf8e4bfed5 --- /dev/null +++ b/test/CodeGen/AMDGPU/bswap.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.bswap.i32(i32) nounwind readnone +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone +declare i64 @llvm.bswap.i64(i64) nounwind readnone +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone +declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone + +; FUNC-LABEL: @test_bswap_i32 +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 +; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff +; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone + store i32 %bswap, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_bswap_v2i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone + store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_bswap_v4i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone + store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_bswap_v8i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: 
v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { + %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone + store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %val = load i64, i64 addrspace(1)* %in, align 8 + %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone + store i64 %bswap, i64 addrspace(1)* %out, align 8 + ret void +} + +define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone + store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 + ret void +} + +define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { + %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 + %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone + store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll new file mode 100644 index 00000000000..65eacf5adc4 --- /dev/null +++ b/test/CodeGen/AMDGPU/build_vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI + +; R600: {{^}}build_vector2: +; R600: MOV +; R600: MOV +; R600-NOT: MOV +; SI: {{^}}build_vector2: +; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 +; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 +; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} +define void @build_vector2 (<2 x i32> addrspace(1)* %out) { +entry: + store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out + ret void +} + +; R600: {{^}}build_vector4: +; R600: MOV +; R600: MOV +; R600: MOV +; R600: MOV +; R600-NOT: MOV +; SI: {{^}}build_vector4: +; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 +; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 +; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 +; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 +; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} +define void @build_vector4 (<4 x i32> addrspace(1)* %out) { +entry: + store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll new file mode 100644 index 00000000000..e769fd11c28 --- /dev/null +++ b/test/CodeGen/AMDGPU/call.ll @@ -0,0 +1,33 @@ +; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s +; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported call to function external_function in test_call_external + + +declare i32 @external_function(i32) nounwind + +define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %c = call i32
@external_function(i32 %b) nounwind + %result = add i32 %a, %c + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define i32 @defined_function(i32 %x) nounwind noinline { + %y = add i32 %x, 8 + ret i32 %y +} + +define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %c = call i32 @defined_function(i32 %b) nounwind + %result = add i32 %a, %c + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/call_fs.ll b/test/CodeGen/AMDGPU/call_fs.ll new file mode 100644 index 00000000000..87bebbc49d5 --- /dev/null +++ b/test/CodeGen/AMDGPU/call_fs.ll @@ -0,0 +1,17 @@ + +; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s + +; EG: .long 257 +; EG: {{^}}call_fs: +; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] +; R600: .long 257 +; R600: {{^}}call_fs: +; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] + + +define void @call_fs() #0 { + ret void +} + +attributes #0 = { "ShaderType"="1" } ; Vertex Shader diff --git a/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/test/CodeGen/AMDGPU/cayman-loop-bug.ll new file mode 100644 index 00000000000..c7b8c403731 --- /dev/null +++ b/test/CodeGen/AMDGPU/cayman-loop-bug.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: LOOP_START_DX10 +; CHECK: ALU_PUSH_BEFORE +; CHECK: LOOP_START_DX10 +; CHECK: PUSH +; CHECK-NOT: ALU_PUSH_BEFORE +; CHECK: END_LOOP +; CHECK: END_LOOP +define void @main (<4 x float> inreg %reg0) #0 { +entry: + br label %outer_loop +outer_loop: + %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop] + %cond = icmp eq i32 %cnt, 16 + br i1 %cond, label %outer_loop_body, label %exit +outer_loop_body: + %cnt_incr = add i32 %cnt, 1 + br label %inner_loop +inner_loop: + %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body] + %cond2 = icmp eq i32 %cnt2, 16 + br i1 %cond, label %inner_loop_body, label %outer_loop +inner_loop_body: + %cnt2_incr = add i32 %cnt2, 1 + br label %inner_loop +exit: + ret void +} + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/cf-stack-bug.ll b/test/CodeGen/AMDGPU/cf-stack-bug.ll new file mode 100644 index 00000000000..75b87e48622 --- /dev/null +++ b/test/CodeGen/AMDGPU/cf-stack-bug.ll @@ -0,0 +1,244 @@ +; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG32 %s < %t + +; RUN: llc -march=r600 
-mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; REQUIRES: asserts + +; We are currently allocating 2 extra sub-entries on Evergreen / NI for +; non-WQM push instructions if we change this to 1, then we will need to +; add one level of depth to each of these tests. + +; BUG64-NOT: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested3: +define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.store.1 + +if.store.1: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + store i32 3, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested4: +define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + store i32 4, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested7: +define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + store i32 7, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested8: +define 
void @nested8(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + %7 = icmp sgt i32 %cond, 70 + br i1 %7, label %if.8, label %if.7.store + +if.7.store: + store i32 7, i32 addrspace(1)* %out + br label %end + +if.8: + store i32 8, i32 addrspace(1)* %out + br label %end + +end: + ret void +} diff --git a/test/CodeGen/AMDGPU/cf_end.ll b/test/CodeGen/AMDGPU/cf_end.ll new file mode 100644 index 00000000000..c74ee22868d --- /dev/null +++ b/test/CodeGen/AMDGPU/cf_end.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s + +; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] +; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] +define void @eop() { + ret void +} diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll new file mode 100644 index 00000000000..77f7bd01b7f --- /dev/null +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -0,0 +1,242 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; OPT-LABEL: @test_sink_global_small_offset_i32( +; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: +; GCN: {{^}}BB0_2: +define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( +; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB1_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* 
%out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} +; GCN: {{^}}BB2_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB3_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_flat_small_offset_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %in +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: +; GCN: flat_load_dword +; GCN: {{^}}BB4_2: + +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(4)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_scratch_small_offset_i32( +; OPT-NOT: getelementptr [512 x i32] +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: {{^}}BB5_2: +define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + 
%add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( +; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: {{^}}BB6_2: +define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: {{^}}BB7_2: +define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { +entry: + %offset.ext = zext i32 %offset to i64 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll new file mode 100644 index 00000000000..96730bcf2e8 --- /dev/null +++ b/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s + +declare float @llvm.fma.f32(float, float, float) + +; This checks that rematerialization support of the coalescer does not +; unnecessarily widen the register class. Without those fixes > 20 VGprs +; are used here +; Also check that some rematerialization of the 0 constant happened. 
+; CHECK-LABEL: foobar +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; It's probably OK if this is slightly higher: +; CHECK: ; NumVgprs: 9 +define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +entry: + %cmpflag = icmp eq i32 %flag, 1 + br i1 %cmpflag, label %loop, label %exit + +loop: + %c = phi i32 [0, %entry], [%cnext, %loop] + %v0 = phi float [0.0, %entry], [%fma.0, %loop] + %v1 = phi float [0.0, %entry], [%fma.1, %loop] + %v2 = phi float [0.0, %entry], [%fma.2, %loop] + %v3 = phi float [0.0, %entry], [%fma.3, %loop] + + ; Try to get the 0 constant to get coalesced into a wide register + %blup = insertelement <4 x float> undef, float %v0, i32 0 + store <4 x float> %blup, <4 x float> addrspace(1)* %out + + %load = load <4 x float>, <4 x float> addrspace(1)* %in + %load.0 = extractelement <4 x float> %load, i32 0 + %load.1 = extractelement <4 x float> %load, i32 1 + %load.2 = extractelement <4 x float> %load, i32 2 + %load.3 = extractelement <4 x float> %load, i32 3 + %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0) + %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1) + %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2) + %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3) + + %cnext = add nsw i32 %c, 1 + %cmp = icmp eq i32 %cnext, 42 + br i1 %cmp, label %exit, label %loop + +exit: + %ev0 = phi float [0.0, %entry], [%fma.0, %loop] + %ev1 = phi float [0.0, %entry], [%fma.1, %loop] + %ev2 = phi float [0.0, %entry], [%fma.2, %loop] + %ev3 = phi float [0.0, %entry], [%fma.3, %loop] + %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0 + %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1 + %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2 + %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3 + store <4 x float> %dst.3, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll new file mode 100644 index 00000000000..58517209267 --- /dev/null +++ b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll @@ -0,0 +1,18 @@ +; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s + +; OPT-LABEL: @test( +; OPT: mul nsw i32 +; OPT-NEXT: sext + +; SI-LLC-LABEL: {{^}}test: +; SI-LLC: s_mul_i32 +; SI-LLC-NOT: mul +define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { +entry: + %0 = mul nsw i32 %a, 3 + %1 = sext i32 %0 to i64 + %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1 + store i8 %b, i8 addrspace(1)* %2 + ret void +} diff --git a/test/CodeGen/AMDGPU/combine_vloads.ll b/test/CodeGen/AMDGPU/combine_vloads.ll new file mode 100644 index 00000000000..01572afa620 --- /dev/null +++ b/test/CodeGen/AMDGPU/combine_vloads.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +; +; kernel void combine_vloads(global char8* src, global char8* result) { +; for (int i = 0; i < 1024; ++i) +; result[i] = src[0] + src[1] + src[2] + src[3]; +; } +; + + +; 128-bit loads instead of many 8-bit +; EG-LABEL: {{^}}combine_vloads: +; EG: VTX_READ_128 +; EG: VTX_READ_128 +define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> 
addrspace(1)* nocapture %result) nounwind { +entry: + br label %for.body + +for.exit:                                         ; preds = %for.body + ret void + +for.body:                                         ; preds = %for.body, %entry + %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ] + %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)* + %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)* + %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32 + %1 = bitcast <8 x i32> %vecload2 to <32 x i8> + %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %tmp9 = add nsw <8 x i8> %tmp5, %tmp8 + %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %tmp13 = add nsw <8 x i8> %tmp9, %tmp12 + %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %tmp17 = add nsw <8 x i8> %tmp13, %tmp16 + %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01 + %2 = bitcast <8 x i8> %tmp17 to <2 x i32> + %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)* + store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8 + %tmp19 = add nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %tmp19, 1024 + br i1 %exitcond, label %for.exit, label %for.body +} diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll new file mode 100644 index 00000000000..31766047a35 --- /dev/null +++ b/test/CodeGen/AMDGPU/commute-compares.ll @@ -0,0 +1,697 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; -------------------------------------------------------------------------------- +; i32 compares +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}commute_eq_64_i32: +; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp eq i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ne_64_i32: +; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ne i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Why isn't this being folded as a constant?
+; GCN-LABEL: {{^}}commute_ne_litk_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 +; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}} +define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ne i32 %val, 12345 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_64_i32: +; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ugt i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_64_i32: +; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} +define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp uge i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_64_i32: +; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ult i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_63_i32: +; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ule i32 %val, 63 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm + +; GCN-LABEL: {{^}}commute_ule_64_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} +; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} +define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ule i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sgt_neg1_i32: +; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} +define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = 
getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sgt i32 %val, -1 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sge_neg2_i32: +; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} +define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sge i32 %val, -2 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_slt_neg16_i32: +; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} +define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp slt i32 %val, -16 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sle_5_i32: +; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} +define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sle i32 %val, 5 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; i64 compares +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}commute_eq_64_i64: +; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp eq i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ne_64_i64: +; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ne i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_64_i64: +; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ugt i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_64_i64: +; GCN: 
v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp uge i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_64_i64: +; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ult i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_63_i64: +; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ule i64 %val, 63 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm + +; GCN-LABEL: {{^}}commute_ule_64_i64: +; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} +; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ule i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sgt_neg1_i64: +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sgt i64 %val, -1 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sge_neg2_i64: +; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sge i64 %val, -2 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_slt_neg16_i64: +; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + 
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp slt i64 %val, -16 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sle_5_i64: +; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sle i64 %val, 5 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; f32 compares +; -------------------------------------------------------------------------------- + + +; GCN-LABEL: {{^}}commute_oeq_2.0_f32: +; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp oeq float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + + +; GCN-LABEL: {{^}}commute_ogt_2.0_f32: +; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ogt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_oge_2.0_f32: +; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp oge float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_olt_2.0_f32: +; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp olt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ole_2.0_f32: +; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ole float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* 
%gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_one_2.0_f32: +; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp one float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ord_2.0_f32: +; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] +define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ord float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ueq_2.0_f32: +; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ueq float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_2.0_f32: +; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ugt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_2.0_f32: +; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp uge float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_2.0_f32: +; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ult float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_2.0_f32: +; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + 
%val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ule float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_une_2.0_f32: +; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp une float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uno_2.0_f32: +; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] +define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp uno float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; f64 compares +; -------------------------------------------------------------------------------- + + +; GCN-LABEL: {{^}}commute_oeq_2.0_f64: +; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp oeq double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + + +; GCN-LABEL: {{^}}commute_ogt_2.0_f64: +; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ogt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_oge_2.0_f64: +; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp oge double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_olt_2.0_f64: +; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp olt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 
%ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ole_2.0_f64: +; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ole double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_one_2.0_f64: +; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp one double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ord_2.0_f64: +; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] +define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ord double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ueq_2.0_f64: +; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ueq double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_2.0_f64: +; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ugt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_2.0_f64: +; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp uge double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_2.0_f64: +; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() 
#0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ult double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_2.0_f64: +; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ule double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_une_2.0_f64: +; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp une double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uno_2.0_f64: +; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] +define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp uno double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll new file mode 100644 index 00000000000..7fc36eabb78 --- /dev/null +++ b/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -0,0 +1,181 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.fma.f32(float, float, float) nounwind readnone + +; FUNC-LABEL: @commute_add_imm_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %z = fadd float 2.0, %x.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = 
getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs + %z = fmul float 4.0, %x.fneg.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_imm_fneg_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fneg = fsub float -0.000000e+00, %x + %z = fmul float 4.0, %x.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; FIXME: Should use SGPR for literal. +; FUNC-LABEL: @commute_add_lit_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 +; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %z = fadd float 1024.0, %x.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_add_fabs_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %z = fadd float %x, %y.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fneg_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fneg = fsub float -0.000000e+00, %y + %z = fmul float %x, %y.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fabs_fneg_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs + %z = fmul float %x, %y.fabs.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; There's no reason to commute this. +; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %z = fmul float %x.fabs, %y.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs + %z = fmul float %x.fabs, %y.fabs.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; Make sure we commute the multiply part for the constant in src0 even +; though we have negate modifier on src2. 
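+; In other words, fma(%r1, 2.0, |%r2|) should select to roughly
+;   v_fma_f32 dst, 2.0, r1, |r2|
+; with the multiply operands swapped so the inline constant 2.0 lands in
+; src0, while the source modifier on src2 (|r2| in the test below) is kept.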
+ +; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32 +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]| +; SI: buffer_store_dword [[RESULT]] +define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r2.fabs = call float @llvm.fabs.f32(float %r2) + + %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/complex-folding.ll b/test/CodeGen/AMDGPU/complex-folding.ll new file mode 100644 index 00000000000..a5399a71324 --- /dev/null +++ b/test/CodeGen/AMDGPU/complex-folding.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main: +; CHECK-NOT: MOV +define void @main(<4 x float> inreg %reg0) #0 { +entry: + %0 = extractelement <4 x float> %reg0, i32 0 + %1 = call float @fabs(float %0) + %2 = fptoui float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = insertelement <4 x float> undef, float %3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0) + ret void +} + +declare float @fabs(float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll new file mode 100644 index 00000000000..a09ed1f7385 --- /dev/null +++ b/test/CodeGen/AMDGPU/concat_vectors.ll @@ -0,0 +1,296 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_concat_v1i32: +; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF +; instructions that access scratch memory. Bit 23, which is the add_tid_enable +; bit, is only set for scratch access, so we can check for the absence of this +; value if we want to ensure scratch memory is not being used. 
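+; As a quick check of that claim: bit 23 corresponds to the mask 0x800000,
+; and 0x80f000 & 0x800000 == 0x800000, so the add_tid_enable bit is indeed
+; set in this value. That is why the SI-NOT lines below look for the
+; 0x80f000 immediate being loaded into an SGPR.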
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> + store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> + store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { + %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { + %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> + store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { + %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> + store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> + store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> + store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> + store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> + store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: 
{{^}}test_concat_v1i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x 
double> %a, <16 x double> %b, <32 x i32> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { + %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> + store <2 x i1> %concat, <2 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { + %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> + store <4 x i1> %concat, <4 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { + %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> + store <8 x i1> %concat, <8 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { + %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> + store <16 x i1> %concat, <16 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { + %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> + store <32 x i1> %concat, <32 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v32i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { + %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> + store <64 x i1> %concat, <64 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { + %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> + store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { + %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> + store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> + store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i16: +; SI-NOT: 
s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { + %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> + store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}concat_vector_crash: +; SI: s_endpgm +define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +bb: + %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 + %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> + %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> + store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll new file mode 100644 index 00000000000..8b397566066 --- /dev/null +++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -0,0 +1,167 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_copy_v4i8: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x2: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x3: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x4: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI-DAG: v_add 
+; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI_DAG: buffer_store_byte + +; After scalarizing v4i8 loads is fixed. +; XSI: buffer_load_dword +; XSI: V_BFE +; XSI: V_ADD +; XSI: V_ADD +; XSI: V_ADD +; XSI: buffer_store_dword +; XSI: buffer_store_dword + +; SI: s_endpgm +define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI_DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI_DAG: buffer_store_byte + +; XSI: buffer_load_dword +; XSI: BFE +; XSI: buffer_store_dword +; XSI: V_ADD +; XSI: buffer_store_dword +; XSI-NEXT: buffer_store_dword + +; SI: s_endpgm +define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v3i8: +; SI-NOT: bfe +; SI-NOT: bfi +; SI: s_endpgm +define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: s_endpgm +define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll new file mode 100644 index 00000000000..fc875f6ef7a --- /dev/null +++ b/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca 
-verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s + +; Test that CopyToReg instructions don't have non-register operands prior +; to being emitted. + +; Make sure this doesn't crash +; CHECK-LABEL: {{^}}copy_to_reg_frameindex: +define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %alloca = alloca [16 x i32] + br label %loop + +loop: + %inc = phi i32 [0, %entry], [%inc.i, %loop] + %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc + store i32 %inc, i32* %ptr + %inc.i = add i32 %inc, 1 + %cnd = icmp uge i32 %inc.i, 16 + br i1 %cnd, label %done, label %loop + +done: + %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 + %tmp1 = load i32, i32* %tmp0 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll new file mode 100644 index 00000000000..bd26c302fe5 --- /dev/null +++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone +declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: +; SI: buffer_load_dwordx2 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] +define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone + store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: +; SI: buffer_load_dwordx4 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone + store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll new file mode 100644 index 00000000000..0a031c5e24d --- /dev/null +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -0,0 +1,300 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctpop_i32: +; GCN: s_load_dword [[SVAL:s[0-9]+]], +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] +; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; GCN: buffer_store_dword [[VRESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + store i32 %ctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; XXX - Why 0 in register? 
+; FUNC-LABEL: {{^}}v_ctpop_i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + store i32 %ctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: +; GCN: buffer_load_dword [[VAL1:v[0-9]+]], +; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { + %val0 = load i32, i32 addrspace(1)* %in0, align 4 + %val1 = load i32, i32 addrspace(1)* %in1, align 4 + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone + %add = add i32 %ctpop0, %ctpop1 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: +; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN-NEXT: s_waitcnt +; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { + %val0 = load i32, i32 addrspace(1)* %in0, align 4 + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %add = add i32 %ctpop0, %sval + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v2i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone + store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v4i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone + store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v8i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { + 
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone + store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v16i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 + %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone + store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, 4 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 4, %ctpop + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, 99999 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) 
nounwind readnone
+ %add = add i32 %ctpop, %const
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+ %const = load i32, i32 addrspace(1)* %gep, align 4
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed.
+
+; FUNC-LABEL: {{^}}ctpop_i32_in_br:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
+; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+; EG: BCNT_INT
+define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+entry:
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %if, label %else
+
+if:
+ %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
+ br label %endif
+
+else:
+ %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %tmp4 = load i32, i32 addrspace(1)* %tmp3
+ br label %endif
+
+endif:
+ %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
new file mode 100644
index 00000000000..e1a0ee3ea21
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -0,0 +1,124 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
+declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
+declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_ctpop_i64:
+; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: 
s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
+define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %val = load i64, i64 addrspace(1)* %in, align 8
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_v2i64:
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
+define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_v4i64:
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
+define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v2i64:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
+define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v4i64:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
+define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed. 
+ +; FUNC-LABEL: {{^}}ctpop_i64_in_br: +; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd +; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 +; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} +; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] +; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] +; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: s_endpgm +define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %else + +if: + %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg) + br label %endif + +else: + %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1 + %tmp4 = load i64, i64 addrspace(1)* %tmp3 + br label %endif + +endif: + %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else] + store i64 %tmp5, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll new file mode 100644 index 00000000000..56fcb51fe14 --- /dev/null +++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: +; SI: buffer_load_dwordx2 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? 
*}}[[RESULT]] +define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone + store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: +; SI: buffer_load_dwordx4 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone + store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll new file mode 100644 index 00000000000..3399d9da29e --- /dev/null +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -0,0 +1,196 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}load_i8_to_f32: +; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] +; SI: buffer_store_dword [[CONV]], +define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { + %load = load i8, i8 addrspace(1)* %in, align 1 + %cvt = uitofp i8 %load to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}load_v2i8_to_v2f32: +; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI-NOT: and +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 + %cvt = uitofp <2 x i8> %load to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v3i8_to_v3f32: +; SI-NOT: bfe +; SI-NOT: v_cvt_f32_ubyte3_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <3 x i8> %load to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v4i8_to_v4f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte0_e32 
v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; This should not be adding instructions to shift into the correct +; position in the word for the component. + +; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: +; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; SI-NOT: v_lshlrev_b32 +; SI-NOT: v_or_b32 + +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]] + +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; XXX - This should really still be able to use the v_cvt_f32_ubyte0 +; for each component, but computeKnownBits doesn't handle vectors very +; well. + +; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 + +; XXX - replace with this when v4i8 loads aren't scalarized anymore. +; XSI: buffer_load_dword +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + %add = add <4 x i8> %load, ; Second use of %load + store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; Make sure this doesn't crash. 
+; SI-LABEL: {{^}}load_v7i8_to_v7f32: +; SI: s_endpgm +define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 + %cvt = uitofp <7 x i8> %load to <7 x float> + store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v8i8_to_v8f32: +; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 + %cvt = uitofp <8 x i8> %load to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]], +; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] +; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] +; SI: buffer_store_dword [[CONV]], +define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 2 + %inreg = and i32 %add, 255 + %cvt = uitofp i32 %inreg to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: +define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %inreg = and i32 %load, 65280 + %shr = lshr i32 %inreg, 8 + %cvt = uitofp i32 %shr to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; We don't get these ones because of the zext, but instcombine removes +; them so it shouldn't really matter. 
+define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { + %load = load i8, i8 addrspace(1)* %in, align 1 + %ext = zext i8 %load to i32 + %cvt = uitofp i32 %ext to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %ext = zext <4 x i8> %load to <4 x i32> + %cvt = uitofp <4 x i32> %ext to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll new file mode 100644 index 00000000000..2dd3a9f2a77 --- /dev/null +++ b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0: +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NOT: add +; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %floor = call float @llvm.floor.f32(float %x) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1: +; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}} +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 1.0 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| +; SI: s_endpgm +define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %floor = call float @llvm.floor.f32(float %x.fabs) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fneg = fsub float -0.000000e+00, %x + %floor = call float @llvm.floor.f32(float %x.fneg) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| +; SI: s_endpgm +define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs + %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* 
%out + ret void +} + +; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0: +; SI-NOT: v_cvt_flr_i32_f32 +; SI: v_floor_f32 +; SI: v_cvt_u32_f32_e32 +; SI: s_endpgm +define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %floor = call float @llvm.floor.f32(float %x) #1 + %cvt = fptoui float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll new file mode 100644 index 00000000000..864ac40260b --- /dev/null +++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 + +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %fadd = fadd float %x.fabs, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FIXME: This doesn't work because it forms fsub 0.5, x +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg: +; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} +; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}} +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fneg = fsub float -0.000000e+00, %x + %fadd = fadd float %x.fneg, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FIXME: This doesn't work for same reason as above +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| + +; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}| +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs + %fadd = fadd float %x.fabs.fneg, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0: +; SI-NOT: v_cvt_rpi_i32_f32 +; SI: v_add_f32 +; SI: v_floor_f32 +; SI: v_cvt_u32_f32 +; SI: s_endpgm +define void @no_cvt_rpi_i32_f32_0(i32 
addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptoui float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll new file mode 100644 index 00000000000..fb43ff4fbdd --- /dev/null +++ b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -0,0 +1,36 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test is for a bug in +; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where +; the wrong type was being passed to +; TargetLowering::getOperationAction() when checking the legality of +; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. + + +; CHECK: {{^}}sint: +; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %sint = load i32, i32 addrspace(1) * %in + %conv = sitofp i32 %sint to float + %0 = insertelement <4 x float> undef, float %conv, i32 0 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} + +;CHECK: {{^}}uint: +;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %uint = load i32, i32 addrspace(1) * %in + %conv = uitofp i32 %uint to float + %0 = insertelement <4 x float> undef, float %conv, i32 0 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/debug.ll b/test/CodeGen/AMDGPU/debug.ll new file mode 100644 index 00000000000..a2e0e878b74 --- /dev/null +++ b/test/CodeGen/AMDGPU/debug.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; Test for a crash in the custom assembly dump code. 
+ +; SI: s_endpgm +define void @test(i32 addrspace(1)* %out) { + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll new file mode 100644 index 00000000000..da8e91454b9 --- /dev/null +++ b/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_kernel: + +; DEFAULT: FloatMode: 192 +; DEFAULT: IeeeMode: 0 + +; FP64-DENORMAL: FloatMode: 192 +; FP64-DENORMAL: IeeeMode: 0 + +; FP32-DENORMAL: FloatMode: 48 +; FP32-DENORMAL: IeeeMode: 0 + +; BOTH-DENORMAL: FloatMode: 240 +; BOTH-DENORMAL: IeeeMode: 0 + +; NO-DENORMAL: FloatMode: 0 +; NO-DENORMAL: IeeeMode: 0 +define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} diff --git a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll new file mode 100644 index 00000000000..cdd2c0cd4f4 --- /dev/null +++ b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; PRED_SET* instructions must be tied to any instruction that uses their +; result. This tests that there are no instructions between the PRED_SET* +; and the PREDICATE_BREAK in this loop. 
+ +; CHECK: {{^}}loop_ge: +; CHECK: LOOP_START_DX10 +; CHECK: ALU_PUSH_BEFORE +; CHECK-NEXT: JUMP +; CHECK-NEXT: LOOP_BREAK +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/test/CodeGen/AMDGPU/dot4-folding.ll b/test/CodeGen/AMDGPU/dot4-folding.ll new file mode 100644 index 00000000000..4df7b63bf98 --- /dev/null +++ b/test/CodeGen/AMDGPU/dot4-folding.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Exactly one constant vector can be folded into dot4, which means exactly +; 4 MOV instructions +; CHECK: {{^}}main: +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV + +define void @main(float addrspace(1)* %out) { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) + %3 = insertelement <4 x float> undef, float %2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll new file mode 100644 index 00000000000..e7e13d6178c --- /dev/null +++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() #1 + +; Function Attrs: nounwind +; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop: +; CHECK: BB0_1: +; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] +; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] +; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] +; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] + +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 +; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 +; CHECK: s_endpgm +define void 
@signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { +entry: + %x.i = tail call i32 @llvm.r600.read.tidig.x() #0 + %mul = shl nsw i32 %x.i, 1 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ] + %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ] + %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.AMDGPU.barrier.local() #1 + %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 + %tmp = load float, float addrspace(3)* %arrayidx, align 4 + %add1 = add nsw i32 %offset.02, 1 + %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 + %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 + %add3 = add nsw i32 %offset.02, 32 + %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 + %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 + %add5 = add nsw i32 %offset.02, 33 + %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 + %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 + %add7 = add nsw i32 %offset.02, 64 + %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7 + %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4 + %add9 = fadd float %tmp, %tmp1 + %add10 = fadd float %add9, %tmp2 + %add11 = fadd float %add10, %tmp3 + %add12 = fadd float %add11, %tmp4 + %add13 = fadd float %sum.03, %add12 + %inc = add nsw i32 %k.01, 1 + %add14 = add nsw i32 %offset.02, 97 + %exitcond = icmp eq i32 %inc, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %tmp5 = sext i32 %x.i to i64 + %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5 + store float %add13, float addrspace(1)* %arrayidx15, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll new file mode 100644 index 00000000000..5929898f8bd --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -0,0 +1,515 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +; FIXME: We don't get cases where the address was an SGPR because we +; get a copy to the address register for each one. 
+ +@lds = addrspace(3) global [512 x float] undef, align 4 + @lds.f64 = addrspace(3) global [512 x double] undef, align 8 + +; SI-LABEL: @simple_read2_f32 +; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f32(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_max_offset +; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_too_far +; SI-NOT ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; SI: s_endpgm +define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_x2 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load 
float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure there is an instruction between the two sets of reads. +; SI-LABEL: @simple_read2_f32_x2_barrier +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; SI: s_barrier +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + call void @llvm.AMDGPU.barrier.local() #2 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; For some reason adding something to the base address for the first +; element results in only folding the inner pair. 
+ +; SI-LABEL: @simple_read2_f32_x2_nonzero_base +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Be careful of vectors of pointers. We don't know if the 2 pointers +; in the vectors are really the same base, so this is not safe to +; merge. +; Base pointers come from different subregister of same super +; register. We can't safely merge this. + +; SI-LABEL: @read2_ptr_is_subreg_arg_f32 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Apply a constant scalar offset after the pointer vector extract. We +; are rejecting merges that have the same, constant 0 offset, so make +; sure we are really rejecting it because of the different +; subregisters. 
+ +; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + + ; Apply an additional offset after the vector that will be more obviously folded. + %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 + + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1.offset, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; We should be able to merge in this case, but probably not worth the effort. +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 + %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 + %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1 + %idx = add <2 x i32> %x.i.v.1, + %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> , <2 x i32> %idx + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_volatile_0 +; SI-NOT ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_volatile_1 +; SI-NOT ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Can't fold since not correctly aligned. +; XXX: This isn't really testing anything useful now. I think CI +; allows unaligned LDS accesses, which would be a problem here. +; SI-LABEL: @unaligned_read2_f32 +; SI-NOT: ds_read2_b32 +; SI: s_endpgm +define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 1 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 1 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @misaligned_2_simple_read2_f32 +; SI-NOT: ds_read2_b32 +; SI: s_endpgm +define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 2 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 2 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f64 +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f64(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2_f64_max_offset +; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 +; SI: s_endpgm +define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 
0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2_f64_too_far +; SI-NOT ds_read2_b64 +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 +; SI: s_endpgm +define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; Alignment only 4 +; SI-LABEL: @misaligned_read2_f64 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 +; SI: s_endpgm +define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 7 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 4 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +@foo = addrspace(3) global [4 x i32] undef, align 4 + +; SI-LABEL: @load_constant_adjacent_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { + %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 + %sum = add i32 %val0, %val1 + store i32 %sum, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @load_constant_disjoint_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { + %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 + %sum = add i32 %val0, %val1 + store i32 %sum, i32 addrspace(1)* %out, align 4 + ret void +} + +@bar = addrspace(3) global [4 x i64] undef, align 4 + +; SI-LABEL: @load_misaligned64_constant_offsets +; 
SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { + %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 + %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 + %sum = add i64 %val0, %val1 + store i64 %sum, i64 addrspace(1)* %out, align 8 + ret void +} + +@bar.large = addrspace(3) global [4096 x i64] undef, align 4 + +; SI-LABEL: @load_misaligned64_constant_large_offsets +; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; SI: s_endpgm +define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { + %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 + %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 + %sum = add i64 %val0, %val1 + store i64 %sum, i64 addrspace(1)* %out, align 8 + ret void +} + +@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 +@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 + +define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { + %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i + %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 + %add47 = add nsw i32 %x.i, 1 + %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 + %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 + %add51 = add nsw i32 %x.i, 16 + %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 + %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 + %add55 = add nsw i32 %x.i, 17 + %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 + %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 + %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i + %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 + %add63 = add nsw i32 %y.i, 1 + %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 + %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 + %add67 = add nsw i32 %y.i, 32 + %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 + %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 + %add71 = add nsw i32 %y.i, 33 + %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 + %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 + %add75 = add nsw i32 %y.i, 64 + %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 + %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 + %add79 = add nsw i32 %y.i, 65 + %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 + %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 + %sum.0 = fadd float %tmp16, %tmp17 + %sum.1 = fadd float %sum.0, %tmp18 + %sum.2 = fadd float %sum.1, %tmp19 + %sum.3 = fadd float %sum.2, %tmp20 + %sum.4 = fadd float %sum.3, %tmp21 + %sum.5 = fadd float %sum.4, %tmp22 + %sum.6 = fadd float %sum.5, %tmp23 + %sum.7 = fadd float %sum.6, %tmp24 + %sum.8 = fadd float %sum.7, %tmp25 + store float %sum.8, float addrspace(1)* %C, align 4 + ret void +} + +define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { + %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { + %load = load i64, i64 addrspace(3)* %in, align 4 + store i64 %load, i64 addrspace(1)* %out, align 8 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll new file mode 100644 index 00000000000..9ea9a5a2617 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +; XFAIL: * + +@lds = addrspace(3) global [512 x float] undef, align 4 + +; SI-LABEL: {{^}}offset_order: + +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 +; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 + +define void @offset_order(float addrspace(1)* %out) { +entry: + %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 + %val0 = load float, float addrspace(3)* %ptr0 + + %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256 + %val1 = load float, float addrspace(3)* %ptr1 + %add1 = fadd float %val0, %val1 + + %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3 + %val2 = load float, float addrspace(3)* %ptr2 + %add2 = fadd float %add1, %val2 + + %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 + %val3 = load float, float addrspace(3)* %ptr3 + %add3 = fadd float %add2, %val3 + + %ptr4 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 + %val4 = load float, float addrspace(3)* %ptr4 + %add4 = fadd float %add3, %val4 + + %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14 + %val5 = load float, float addrspace(3)* %ptr5 + %add5 = fadd float %add4, %val5 + + %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 + %val6 = load float, float addrspace(3)* %ptr6 + %add6 = fadd float %add5, %val6 + store float %add6, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll new file mode 100644 index 00000000000..54b3b45636d --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -0,0 +1,272 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + + +; SI-LABEL: @simple_read2st64_f32_0_1 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_1_2 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 128 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_max_offset +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, 
float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 16320 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_over_max_offset +; SI-NOT: ds_read2st64_b32 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 +; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] +; SI: s_endpgm +define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 16384 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @odd_invalid_read2st64_f32_0 +; SI-NOT: ds_read2st64_b32 +; SI: s_endpgm +define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 63 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @odd_invalid_read2st64_f32_1 +; SI-NOT: ds_read2st64_b32 +; SI: s_endpgm +define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 127 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_0_1 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 
%x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_1_2 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; Alignment only + +; SI-LABEL: @misaligned_read2st64_f64 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 +; SI: s_endpgm +define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 4 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff +; SI-LABEL: @simple_read2st64_f64_max_offset +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 256 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; 
SI-LABEL: @simple_read2st64_f64_over_max_offset +; SI-NOT: ds_read2st64_b64 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] +; SI: s_endpgm +define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8192 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @invalid_read2st64_f64_odd_offset +; SI-NOT: ds_read2st64_b64 +; SI: s_endpgm +define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8129 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; The stride of 8 elements is 8 * 8 bytes. We need to make sure the +; stride in elements, not bytes, is a multiple of 64. 
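(To spell out the arithmetic behind the two comments above, as I read them: a ds_read2st64_b64 offset unit is 64 elements of 8 bytes, i.e. 512 bytes.

    0xff * 64 * 8 = 130560 bytes  >  0xffff (65535), so offset1:255 does not encode
    127  * 64 * 8 =  65024 bytes <=  0xffff,         so offset1:127 is the usable maximum

That is why the max-offset test above expects offset0:4 offset1:127. In the test that follows, the two f64 loads are only 8 elements apart; that is 64 bytes, but st64 needs the distance in elements, not bytes, to be a multiple of 64, so the pair falls back to a plain ds_read2_b64 with offset1:8.)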
+ +; SI-LABEL: @byte_size_only_divisible_64_read2_f64 +; SI-NOT: ds_read2st_b64 +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 +; SI: s_endpgm +define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll new file mode 100644 index 00000000000..b553d3459e4 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -0,0 +1,425 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + + +; SI-LABEL: @simple_write2_one_val_f32 +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i + %val = load float, float addrspace(1)* %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + 
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_volatile_0 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store volatile float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_volatile_1 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; 2 data subregisters from different super registers. 
+; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 +; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 + %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 + %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 + %val0.0 = extractelement <2 x float> %val0, i32 0 + %val1.1 = extractelement <2 x float> %val1, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0.0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1.1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg2_f32 +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 + %val0 = extractelement <2 x float> %val, i32 0 + %val1 = extractelement <2 x float> %val, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg4_f32 +; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i + %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 + %val0 = extractelement <4 x float> %val, i32 0 + %val1 = extractelement <4 x float> %val, i32 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_max_offset_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: 
buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; SI: s_endpgm +define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_too_far_f32 +; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; SI: s_endpgm +define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_x2 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: 
@simple_write2_two_val_f32_x2_nonzero_base +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: s_endpgm +define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + + ; Apply an additional offset after the vector that will be more obviously folded. 
+ %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 + store float %val0, float addrspace(3)* %gep.0, align 4 + + %add.x = add nsw i32 %x.i, 8 + store float %val1, float addrspace(3)* %gep.1.offset, align 4 + ret void +} + +; SI-LABEL: @simple_write2_one_val_f64 +; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; SI-LABEL: @misaligned_simple_write2_one_val_f64 +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 +; SI: s_endpgm +define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 7 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f64 +; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 + %val0 = load double, double addrspace(1)* %in.gep.0, align 8 + %val1 = load double, double addrspace(1)* %in.gep.1, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val0, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val1, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +@foo = addrspace(3) global [4 x i32] undef, align 4 + +; SI-LABEL: @store_constant_adjacent_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +define void @store_constant_adjacent_offsets() { + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 + ret void +} + +; SI-LABEL: @store_constant_disjoint_offsets +; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} +; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +define void @store_constant_disjoint_offsets() { + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 + ret void +} + +@bar = addrspace(3) global [4 x i64] undef, align 4 + +; SI-LABEL: @store_misaligned64_constant_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +define void @store_misaligned64_constant_offsets() { + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 + ret void +} + +@bar.large = addrspace(3) global [4096 x i64] undef, align 4 + +; SI-LABEL: @store_misaligned64_constant_large_offsets +; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} +; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: s_endpgm +define void @store_misaligned64_constant_large_offsets() { + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 + ret void +} + +@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 +@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 + +define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %val = load float, float addrspace(1)* %in + %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx44, align 4 + %add47 = add nsw i32 %x.i, 1 + %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 + store float %val, float addrspace(3)* %arrayidx48, align 4 + %add51 = add nsw i32 %x.i, 16 + %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 + store float %val, float addrspace(3)* %arrayidx52, align 4 + %add55 = add nsw i32 %x.i, 17 + %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 + store float %val, float addrspace(3)* %arrayidx56, align 4 + %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i + store float %val, 
float addrspace(3)* %arrayidx60, align 4 + %add63 = add nsw i32 %y.i, 1 + %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 + store float %val, float addrspace(3)* %arrayidx64, align 4 + %add67 = add nsw i32 %y.i, 32 + %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 + store float %val, float addrspace(3)* %arrayidx68, align 4 + %add71 = add nsw i32 %y.i, 33 + %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 + store float %val, float addrspace(3)* %arrayidx72, align 4 + %add75 = add nsw i32 %y.i, 64 + %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 + store float %val, float addrspace(3)* %arrayidx76, align 4 + %add79 = add nsw i32 %y.i, 65 + %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 + store float %val, float addrspace(3)* %arrayidx80, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll new file mode 100644 index 00000000000..1d9d881c5c7 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -0,0 +1,119 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + + +@lds = addrspace(3) global [512 x float] undef, align 4 + + +; SI-LABEL: @simple_write2st64_one_val_f32_0_1 +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 +; SI: s_endpgm +define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i + %val = load float, float addrspace(1)* %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_f32_2_5 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 +; SI: s_endpgm +define void 
@simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %add.x.0 = add nsw i32 %x.i, 128 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 320 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; SI: s_endpgm +define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 16320 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 +; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], +; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 +; SI: s_endpgm +define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 + %val0 = load double, double addrspace(1)* %in.gep.0, align 8 + %val1 = load double, double addrspace(1)* %in.gep.1, align 8 + %add.x.0 = add nsw i32 %x.i, 256 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + store double %val0, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + store double %val1, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 +; SI-NOT: ds_write2st64_b64 +; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 +; SI: s_endpgm +define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll new file mode 100644 index 00000000000..d0fd06a3437 --- /dev/null +++ b/test/CodeGen/AMDGPU/elf.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s + +; Test that we don't try to produce a COFF file on windows +; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s + +; ELF: Format: ELF32 +; ELF: Name: .AMDGPU.config +; ELF: Type: SHT_PROGBITS + +; ELF: Symbol { +; ELF: Name: test +; ELF: Binding: Global + +; CONFIG: .section .AMDGPU.config +; CONFIG-NEXT: .long 45096 +; TYPICAL-NEXT: .long 0 +; TONGA-NEXT: .long 576 +; CONFIG: .align 256 +; CONFIG: test: +define void @test(i32 %p) #0 { + %i = add i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader diff --git a/test/CodeGen/AMDGPU/elf.r600.ll b/test/CodeGen/AMDGPU/elf.r600.ll new file mode 100644 index 00000000000..51cd0850093 --- /dev/null +++ b/test/CodeGen/AMDGPU/elf.r600.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF %s +; RUN: llc < 
%s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s + +; ELF: Format: ELF32 +; ELF: Name: .AMDGPU.config + +; CONFIG: .section .AMDGPU.config +; CONFIG-NEXT: .long 166100 +; CONFIG-NEXT: .long 2 +; CONFIG-NEXT: .long 165900 +; CONFIG-NEXT: .long 0 +define void @test(float addrspace(1)* %out, i32 %p) { + %i = add i32 %p, 2 + %r = bitcast i32 %i to float + store float %r, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/empty-function.ll b/test/CodeGen/AMDGPU/empty-function.ll new file mode 100644 index 00000000000..a060900811e --- /dev/null +++ b/test/CodeGen/AMDGPU/empty-function.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Make sure we don't assert on empty functions + +; SI: .text +; SI-LABEL: {{^}}empty_function_ret: +; SI: s_endpgm +; SI: codeLenInByte = 4 +define void @empty_function_ret() #0 { + ret void +} + +; SI: .text +; SI-LABEL: {{^}}empty_function_unreachable: +; SI: codeLenInByte = 0 +define void @empty_function_unreachable() #0 { + unreachable +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll new file mode 100644 index 00000000000..267a323c506 --- /dev/null +++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; This tests that the llvm.SI.end.cf intrinsic is not inserted into the +; loop block. This intrinsic will be lowered to s_or_b64 by the code +; generator. + +; CHECK-LABEL: {{^}}test: + +; This is was lowered from the llvm.SI.end.cf intrinsic: +; CHECK: s_or_b64 exec, exec + +; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} +; CHECK-NOT: s_or_b64 exec, exec +; CHECK: s_cbranch_execnz [[LOOP_LABEL]] +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %loop + +if: + store i32 0, i32 addrspace(1)* %out + br label %loop + +loop: + %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop] + %inc = add i32 %tmp1, %cond + %tmp2 = icmp ugt i32 %inc, 10 + br i1 %tmp2, label %done, label %loop + +done: + %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1 + store i32 %inc, i32 addrspace(1)* %tmp3 + ret void +} diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll new file mode 100644 index 00000000000..294c3a9c678 --- /dev/null +++ b/test/CodeGen/AMDGPU/extload-private.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}load_i8_sext_private: +; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i8_sext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i8 + %tmp1 = load i8, i8* %tmp0 + %tmp2 = sext i8 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_zext_private: +; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i8_zext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i8 + %tmp1 = load i8, i8* %tmp0 + %tmp2 = zext i8 %tmp1 to i32 + store i32 
%tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext_private: +; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i16_sext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i16 + %tmp1 = load i16, i16* %tmp0 + %tmp2 = sext i16 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_zext_private: +; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i16_zext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i16 + %tmp1 = load i16, i16* %tmp0 + %tmp2 = zext i16 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll new file mode 100644 index 00000000000..662eb7a9716 --- /dev/null +++ b/test/CodeGen/AMDGPU/extload.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}anyext_load_i8: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], +; EG: VTX_READ_32 [[VAL]] + +define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { + %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %cast, align 1 + %x = bitcast i32 %load to <4 x i8> + %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)* + store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_i16: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], +; EG: VTX_READ_32 [[VAL]] + +define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { + %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %cast, align 1 + %x = bitcast i32 %load to <2 x i16> + %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)* + store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_lds_i8: +; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] +; EG: LDS_WRITE * [[VAL]] +define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { + %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* + %load = load i32, i32 addrspace(3)* %cast, align 1 + %x = bitcast i32 %load to <4 x i8> + %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)* + store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_lds_i16: +; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] +; EG: LDS_WRITE * [[VAL]] +define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { + %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* + %load = load i32, i32 addrspace(3)* %cast, align 1 + %x = bitcast i32 %load to <2 x i16> + %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)* + store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll 
new file mode 100644 index 00000000000..c7572efc6f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}extract_vector_elt_v2i16: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { + %p0 = extractelement <2 x i16> %foo, i32 0 + %p1 = extractelement <2 x i16> %foo, i32 1 + %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + store i16 %p1, i16 addrspace(1)* %out, align 2 + store i16 %p0, i16 addrspace(1)* %out1, align 2 + ret void +} + +; FUNC-LABEL: {{^}}extract_vector_elt_v4i16: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { + %p0 = extractelement <4 x i16> %foo, i32 0 + %p1 = extractelement <4 x i16> %foo, i32 2 + %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + store i16 %p1, i16 addrspace(1)* %out, align 2 + store i16 %p0, i16 addrspace(1)* %out1, align 2 + ret void +} diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll new file mode 100644 index 00000000000..3c6136c1a7b --- /dev/null +++ b/test/CodeGen/AMDGPU/fabs.f64.ll @@ -0,0 +1,97 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +declare double @fabs(double) readnone +declare double @llvm.fabs.f64(double) readnone +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone + +; FUNC-LABEL: {{^}}v_fabs_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tidext = sext i32 %tid to i64 + %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext + %val = load double, double addrspace(1)* %gep, align 8 + %fabs = call double @llvm.fabs.f64(double %val) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_f64: +; SI: v_and_b32 +; SI-NOT: v_and_b32 +; SI: s_endpgm +define void @fabs_f64(double addrspace(1)* %out, double %in) { + %fabs = call double @llvm.fabs.f64(double %in) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v2f64: +; SI: v_and_b32 +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { + %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) + store <2 x double> %fabs, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v4f64: +; SI: v_and_b32 +; SI: v_and_b32 +; SI: v_and_b32 +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { + %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) + store <4 x double> %fabs, <4 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fabs_fold_f64: +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NOT: and +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} +; SI: 
s_endpgm +define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { + %fabs = call double @llvm.fabs.f64(double %in0) + %fmul = fmul double %fabs, %in1 + store double %fmul, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fabs_fn_fold_f64: +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NOT: and +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} +; SI: s_endpgm +define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { + %fabs = call double @fabs(double %in0) + %fmul = fmul double %fabs, %in1 + store double %fmul, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_free_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { + %bc= bitcast i64 %in to double + %fabs = call double @llvm.fabs.f64(double %bc) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_fn_free_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { + %bc= bitcast i64 %in to double + %fabs = call double @fabs(double %bc) + store double %fabs, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll new file mode 100644 index 00000000000..419a73d0266 --- /dev/null +++ b/test/CodeGen/AMDGPU/fabs.ll @@ -0,0 +1,101 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; DAGCombiner will transform: +; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) +; unless isFabsFree returns true + +; FUNC-LABEL: {{^}}fabs_fn_free: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| + +; GCN: v_and_b32 + +define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { + %bc= bitcast i32 %in to float + %fabs = call float @fabs(float %bc) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_free: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| + +; GCN: v_and_b32 + +define void @fabs_free(float addrspace(1)* %out, i32 %in) { + %bc= bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +define void @fabs_f32(float addrspace(1)* %out, float %in) { + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v2f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +; GCN: v_and_b32 +define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v4f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + store <4 x float> 
%fabs, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fabs_fn_fold: +; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: and +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { + %fabs = call float @fabs(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fabs_fold: +; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: and +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { + %fabs = call float @llvm.fabs.f32(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, float addrspace(1)* %out + ret void +} + +declare float @fabs(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll new file mode 100644 index 00000000000..5fac328c598 --- /dev/null +++ b/test/CodeGen/AMDGPU/fadd.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +; FUNC-LABEL: {{^}}fadd_f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: v_add_f32 +define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { + %add = fadd float %a, %b + store float %add, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v2f32: +; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z +; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { + %add = fadd <2 x float> %a, %b + store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v4f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 + %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v8f32: +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { + %add = fadd <8 x float> %a, %b + store <8 x 
float> %add, <8 x float> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll new file mode 100644 index 00000000000..485c55870c4 --- /dev/null +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}fadd_f64: +; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} + +define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll new file mode 100644 index 00000000000..f23e8919d73 --- /dev/null +++ b/test/CodeGen/AMDGPU/fceil.ll @@ -0,0 +1,132 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.ceil.f32(float) nounwind readnone +declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone + +; FUNC-LABEL: {{^}}fceil_f32: +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.ceil.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v2f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v3f32: +; FIXME-SI: v_ceil_f32_e32 +; FIXME-SI: v_ceil_f32_e32 +; FIXME-SI: v_ceil_f32_e32 +; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { + %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone + store <3 x float> %y, <3 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v4f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? 
*}}[[RESULT]] +define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v8f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v16f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] +define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll new file mode 100644 index 00000000000..e8c34f0141e --- /dev/null +++ b/test/CodeGen/AMDGPU/fceil64.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.ceil.f64(double) nounwind readnone +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}fceil_f64: +; CI: v_ceil_f64_e32 +; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 +; SI: s_lshr_b64 +; SI: s_not_b64 +; SI: s_and_b64 +; SI: cmp_gt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: cmp_lt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI-DAG: v_cmp_lt_f64 +; SI-DAG: v_cmp_lg_f64 +; SI: s_and_b64 +; SI: v_cndmask_b32 +; SI: v_cndmask_b32 +; SI: v_add_f64 +; SI: s_endpgm +define void @fceil_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.ceil.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v2f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}fceil_v3f64: +; FIXME-CI: v_ceil_f64_e32 +; FIXME-CI: v_ceil_f64_e32 +; FIXME-CI: v_ceil_f64_e32 +; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}fceil_v4f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v8f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v16f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; 
CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp-cnd.ll b/test/CodeGen/AMDGPU/fcmp-cnd.ll new file mode 100644 index 00000000000..530274f920f --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp-cnd.ll @@ -0,0 +1,14 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Not checking arguments 2 and 3 to CNDE, because they may change between +;registers and literal.x depending on what the optimizer does. +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %cmp = fcmp oeq float %0, 0.000000e+00 + %value = select i1 %cmp, i32 2, i32 3 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll new file mode 100644 index 00000000000..c402805feb3 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the +; chance to optimize the fcmp + select instructions to SET* was missed +; due to the fact that the operands to fcmp and select had different types + +; CHECK: SET{{[A-Z]+}}_DX10 + +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %cmp = fcmp oeq float %0, 0.000000e+00 + %value = select i1 %cmp, i32 -1, i32 0 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll new file mode 100644 index 00000000000..5207ab57bad --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}fcmp_sext: +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 + %1 = load float, float addrspace(1)* %arrayidx1 + %cmp = fcmp oeq float %0, %1 + %sext = sext i1 %cmp to i32 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +; This test checks that a setcc node with f32 operands is lowered to a +; SET*_DX10 instruction. 
Previously we were lowering this to: +; SET* + FP_TO_SINT + +; CHECK: {{^}}fcmp_br: +; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} +; CHECK-NEXT {{[0-9]+(5.0}} + +define void @fcmp_br(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %1 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll new file mode 100644 index 00000000000..053ab0ed7aa --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp64.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}flt_f64: +; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ult double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fle_f64: +; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ule double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fgt_f64: +; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ugt double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fge_f64: +; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp uge double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fne_f64: +; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp une double %r0, %r1 + %r3 = select i1 %r2, double %r0, double %r1 + store double %r3, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}feq_f64: +; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ueq double %r0, %r1 + %r3 = select i1 %r2, double %r0, double %r1 + store double %r3, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll new file mode 100644 index 00000000000..89af37545c9 --- /dev/null +++ 
b/test/CodeGen/AMDGPU/fconst64.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}fconst_f64: +; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 +; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 + +define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %r1 = load double, double addrspace(1)* %in + %r2 = fadd double %r1, 5.000000e+00 + store double %r2, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll new file mode 100644 index 00000000000..b719d5a3978 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.copysign.f32(float, float) nounwind readnone +declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone + +; Try to identify arg based on higher address. +; FUNC-LABEL: {{^}}test_copysign_f32: +; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb +; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc +; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c +; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 +; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] +; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] +; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BFI_INT +define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { + %result = call float @llvm.copysign.f32(float %mag, float %sign) + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v2f32: +; GCN: s_endpgm + +; EG: BFI_INT +; EG: BFI_INT +define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { + %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v4f32: +; GCN: s_endpgm + +; EG: BFI_INT +; EG: BFI_INT +; EG: BFI_INT +; EG: BFI_INT +define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { + %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll new file mode 100644 index 00000000000..3d8c5599308 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s + +declare double @llvm.copysign.f64(double, double) nounwind readnone 
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}test_copysign_f64: +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] +; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] +; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff +; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] +; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} +; GCN: s_endpgm +define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { + %result = call double @llvm.copysign.f64(double %mag, double %sign) + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v2f64: +; GCN: s_endpgm +define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { + %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) + store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v4f64: +; GCN: s_endpgm +define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { + %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) + store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll new file mode 100644 index 00000000000..7c022e38c80 --- /dev/null +++ b/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -0,0 +1,96 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s + + +; COMMON-LABEL: {{^}}fdiv_f64: +; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 +; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] + +; Check for div_scale bug workaround on SI +; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] + +; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] + +; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc + +; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], 
-[[SCALE0]], [[RCP_SCALE0]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] +; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] +; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] +; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] +; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] +; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] +; COMMON: buffer_store_dwordx2 [[RESULT]] +; COMMON: s_endpgm +define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { + %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 + %num = load double, double addrspace(1)* %in + %den = load double, double addrspace(1)* %gep.1 + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_v: +define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { + %den = load double, double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_v_s: +define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { + %num = load double, double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_s: +define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v2f64: +define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 + %num = load <2 x double>, <2 x double> addrspace(1)* %in + %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v2f64: +define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v4f64: +define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 + %num = load <4 x double>, <4 x double> addrspace(1)* %in + %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v4f64: +define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll new file mode 100644 index 00000000000..7cbf8733639 --- /dev/null +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -0,0 +1,68 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s +; RUN: llc -march=amdgcn 
-mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; These tests check that fdiv is expanded correctly and also test that the +; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate +; instruction groups. + +; FUNC-LABEL: {{^}}fdiv_f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fdiv float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + + + +; FUNC-LABEL: {{^}}fdiv_v2f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fdiv <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fdiv_v4f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fdiv <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fetch-limits.r600.ll b/test/CodeGen/AMDGPU/fetch-limits.r600.ll new file mode 100644 index 00000000000..e7160ef5d72 --- /dev/null +++ b/test/CodeGen/AMDGPU/fetch-limits.r600.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s + +; R600 supports 8 fetches in a clause +; CHECK: {{^}}fetch_limits_r600: +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r600() #0 { +entry: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* 
null, i64 0, i32 3) + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + + %bcde = fadd <4 x float> %bc, %de + + call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll new file mode 100644 index 00000000000..acaea2aa794 --- /dev/null +++ b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; r700+ supports 16 fetches in a clause +; CHECK: {{^}}fetch_limits_r700: +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r700() #0 { +entry: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 3) + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) + %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) + %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %res9 + %f = fadd <4 x float> %res10, %res11 + %g = fadd <4 x float> 
%res12, %res13 + %h = fadd <4 x float> %res14, %res15 + %i = fadd <4 x float> %res16, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + %fg = fadd <4 x float> %f, %g + %hi = fadd <4 x float> %h, %i + + %bcde = fadd <4 x float> %bc, %de + %fghi = fadd <4 x float> %fg, %hi + + %bcdefghi = fadd <4 x float> %bcde, %fghi + call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll new file mode 100644 index 00000000000..45f8382c392 --- /dev/null +++ b/test/CodeGen/AMDGPU/ffloor.f64.ll @@ -0,0 +1,127 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.fabs.f64(double %Val) +declare double @llvm.floor.f64(double) nounwind readnone +declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}ffloor_f64: +; CI: v_floor_f64_e32 +; SI: v_fract_f64_e32 +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 +; SI: s_endpgm +define void @ffloor_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.floor.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_f64_neg: +; CI: v_floor_f64_e64 +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]] +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] +; SI: s_endpgm +define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { + %neg = fsub double 0.0, %x + %y = call double @llvm.floor.f64(double %neg) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_f64_neg_abs: +; CI: v_floor_f64_e64 +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]| +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| +; SI: s_endpgm +define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { + %abs = call double @llvm.fabs.f64(double %x) + %neg = fsub double 0.0, %abs + %y = call double @llvm.floor.f64(double %neg) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v2f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64: 
+; FIXME-CI: v_floor_f64_e32 +; FIXME-CI: v_floor_f64_e32 +; FIXME-CI: v_floor_f64_e32 +; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ffloor_v4f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v8f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v16f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll new file mode 100644 index 00000000000..61c46ac2bc0 --- /dev/null +++ b/test/CodeGen/AMDGPU/ffloor.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}floor_f32: +; SI: v_floor_f32_e32 +; R600: FLOOR +define void @floor_f32(float addrspace(1)* %out, float %in) { + %tmp = call float @llvm.floor.f32(float %in) #0 + store float %tmp, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}floor_v2f32: +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 + +define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 + store <2 x float> %tmp, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}floor_v4f32: +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 + +; R600: FLOOR +; R600: FLOOR +; R600: FLOOR +; R600: FLOOR +define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 + store <4 x float> %tmp, <4 x float> addrspace(1)* %out + ret void +} + +; Function Attrs: nounwind readonly +declare float @llvm.floor.f32(float) #0 + +; Function Attrs: nounwind readonly +declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0 + +; Function Attrs: nounwind readonly +declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0 + +attributes #0 = { nounwind readnone } diff --git 
a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll new file mode 100644 index 00000000000..8ceca078f2d --- /dev/null +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -0,0 +1,184 @@ +; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; These testcases might become useless when there are optimizations to +; remove generic pointers. 
+ +; CHECK-LABEL: {{^}}store_flat_i32: +; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_i64: +; CHECK: flat_store_dwordx2 +define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* + store i64 %x, i64 addrspace(4)* %fptr, align 8 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_v4i32: +; CHECK: flat_store_dwordx4 +define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* + store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_trunc_i16: +; CHECK: flat_store_short +define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %y = trunc i32 %x to i16 + store i16 %y, i16 addrspace(4)* %fptr, align 2 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_trunc_i8: +; CHECK: flat_store_byte +define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %y = trunc i32 %x to i8 + store i8 %y, i8 addrspace(4)* %fptr, align 2 + ret void +} + + + +; CHECK-LABEL @load_flat_i32: +; CHECK: flat_load_dword +define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + %fload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %fload, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @load_flat_i64: +; CHECK: flat_load_dwordx2 +define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* + %fload = load i64, i64 addrspace(4)* %fptr, align 4 + store i64 %fload, i64 addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL @load_flat_v4i32: +; CHECK: flat_load_dwordx4 +define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* + %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4 + store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL @sextload_flat_i8: +; CHECK: flat_load_sbyte +define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fload = load i8, i8 addrspace(4)* %fptr, align 4 + %ext = sext i8 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @zextload_flat_i8: +; CHECK: flat_load_ubyte +define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fload = load i8, i8 addrspace(4)* %fptr, align 4 + %ext = zext i8 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @sextload_flat_i16: +; CHECK: 
flat_load_sshort +define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fload = load i16, i16 addrspace(4)* %fptr, align 4 + %ext = sext i16 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @zextload_flat_i16: +; CHECK: flat_load_ushort +define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fload = load i16, i16 addrspace(4)* %fptr, align 4 + %ext = zext i16 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. +; CHECK-LABEL: {{^}}store_flat_scratch: +; CHECK: s_movk_i32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} +; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} +; CHECK: flat_store_dword +; CHECK: s_barrier +; CHECK: flat_load_dword +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32, i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind noduplicate } +attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/floor.ll b/test/CodeGen/AMDGPU/floor.ll new file mode 100644 index 00000000000..c6bfb8567a0 --- /dev/null +++ b/test/CodeGen/AMDGPU/floor.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s + +; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = call float @floor(float %r0) + %vec = insertelement <4 x float> undef, float %r1, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @floor(float) readonly +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll new file mode 100644 index 00000000000..bd574b87711 --- /dev/null +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -0,0 +1,368 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare double @llvm.fabs.f64(double) #0 +declare double @llvm.fma.f64(double, double, double) #0 +declare float @llvm.fma.f32(float, float, float) #0 + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_fma_f64_0: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 
[[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fadd double %mul, %c + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %fma0 = fadd double %mul, %c + %fma1 = fadd double %mul, %d + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_fma_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI: 
buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fadd double %c, %mul + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fsub double %mul, %c + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + 
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %fma0 = fsub double %mul, %c + %fma1 = fsub double %mul, %d + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fsub double %c, %mul + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* 
%gep.3 + + %mul = fmul double %a, %b + %fma0 = fsub double %c, %mul + %fma1 = fsub double %d, %mul + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %mul.neg = fsub double -0.0, %mul + %fma = fsub double %mul.neg, %c + + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %mul.neg = fsub double -0.0, %mul + %fma0 = fsub double %mul.neg, %c + %fma1 = fsub double %mul.neg, %d + + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma 
(fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %mul.neg = fsub double -0.0, %mul + %fma0 = fsub double %mul.neg, %c + %fma1 = fsub double %mul, %d + + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64: +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]] +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %x = load double, double addrspace(1)* %gep.0 + %y = load double, double addrspace(1)* %gep.1 + %z = load double, double 
addrspace(1)* %gep.2 + %u = load double, double addrspace(1)* %gep.3 + %v = load double, double addrspace(1)* %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 + %tmp2 = fsub double %tmp1, %z + + store double %tmp2, double addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]] +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %x = load double, double addrspace(1)* %gep.0 + %y = load double, double addrspace(1)* %gep.1 + %z = load double, double addrspace(1)* %gep.2 + %u = load double, double addrspace(1)* %gep.3 + %v = load double, double addrspace(1)* %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 + %tmp2 = fsub double %x, %tmp1 + + store double %tmp2, double addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll new file mode 100644 index 00000000000..0a55ef77855 --- /dev/null +++ b/test/CodeGen/AMDGPU/fma.f64.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.fma.f64(double, double, double) nounwind readnone +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone + + +; FUNC-LABEL: {{^}}fma_f64: +; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double @llvm.fma.f64(double %r0, double 
%r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v2f64: +; SI: v_fma_f64 +; SI: v_fma_f64 +define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3 + %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) + store <2 x double> %r3, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v4f64: +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { + %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 + %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3 + %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2) + store <4 x double> %r3, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll new file mode 100644 index 00000000000..d6024aa0b4c --- /dev/null +++ b/test/CodeGen/AMDGPU/fma.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fma.f32(float, float, float) nounwind readnone +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}fma_f32: +; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, +; EG: FMA {{\*? *}}[[RES]] +define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) { + %r0 = load float, float addrspace(1)* %in1 + %r1 = load float, float addrspace(1)* %in2 + %r2 = load float, float addrspace(1)* %in3 + %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) + store float %r3, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v2f32: +; SI: v_fma_f32 +; SI: v_fma_f32 + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, +; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] +; EG-DAG: FMA {{\*? 
*}}[[RES]].[[CHHI]] +define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { + %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 + %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 + %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3 + %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) + store <2 x float> %r3, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v4f32: +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, +; EG-DAG: FMA {{\*? *}}[[RES]].X +; EG-DAG: FMA {{\*? *}}[[RES]].Y +; EG-DAG: FMA {{\*? *}}[[RES]].Z +; EG-DAG: FMA {{\*? *}}[[RES]].W +define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { + %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 + %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 + %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3 + %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) + store <4 x float> %r3, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 +; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}} +define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b) + store float %fma, float addrspace(1)* %out.gep, align 4 + ret void +} + +; FUNC-LABEL: @fma_commute_mul_s_f32 +define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %c = load float, float addrspace(1)* %in.b.gep, align 4 + + %fma = call float @llvm.fma.f32(float %a, float %b, float %c) + store float %fma, float addrspace(1)* %out.gep, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmad.ll b/test/CodeGen/AMDGPU/fmad.ll new file mode 100644 index 00000000000..935e35123f4 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmad.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = extractelement <4 x float> %reg0, i32 2 + %r3 = fmul float %r0, %r1 + %r4 = fadd float %r3, %r2 + %vec = insertelement <4 x float> undef, float %r4, i32 0 + call void 
@llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @fabs(float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fmax.ll b/test/CodeGen/AMDGPU/fmax.ll new file mode 100644 index 00000000000..d7127f485c7 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp oge float %r0, %r1 + %r3 = select i1 %r2, float %r0, float %r1 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll new file mode 100644 index 00000000000..f78c71b2826 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.maxnum.f64(double, double) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_f64: +; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}} +; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 +; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 +; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] +; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 + %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load double, double addrspace(1)* %bptr, align 8 + %c = load double, double addrspace(1)* %cptr, align 8 + %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone + %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone + store double %f1, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll new file mode 100644 index 00000000000..c3028a6217d --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax3.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.maxnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_olt_0: +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + 
%b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmax +; SI-LABEL: {{^}}test_fmax3_olt_1: +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll new file mode 100644 index 00000000000..828243888ac --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; Make sure we don't try to form FMAX_LEGACY nodes with f64 + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmax_legacy_uge_f64 +define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp uge double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f64 +define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp oge double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f64 +define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ugt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ogt_f64 +define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = 
getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ogt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll new file mode 100644 index 00000000000..413957d2982 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -0,0 +1,116 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmax_legacy_uge_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; EG: MAX +define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp uge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp oge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = 
getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ugt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ogt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ogt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + + +; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-NOT: v_max_ +; SI: v_cmp_gt_f32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_max_ + +; EG: MAX +define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ogt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out0, align 4 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll new file mode 100644 index 00000000000..de563cec341 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.maxnum.f64(double, double) #0 +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0 +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0 +declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0 +declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0 + +; FUNC-LABEL: @test_fmax_f64 +; SI: v_max_f64 +define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { + %val = call double @llvm.maxnum.f64(double %a, double %b) #0 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_v2f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 + store <2 x double> %val, <2 x 
double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmax_v4f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 + store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmax_v8f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 + store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_fmax_v16f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 + store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll new file mode 100644 index 00000000000..3029bd02e4d --- /dev/null +++ b/test/CodeGen/AMDGPU/fmaxnum.ll @@ -0,0 +1,283 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.maxnum.f32(float, float) #0 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0 +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0 +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0 +declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0 + +declare double @llvm.maxnum.f64(double, double) + +; FUNC-LABEL: @test_fmax_f32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float %b) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_v2f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_v4f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) 
#0 + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmax_v8f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W +define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0 + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmax_v16f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W +define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 2143289344(nan) +define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_val_nan +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: 
buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_nan_val +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_var_immediate_f32 +; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_immediate_var_f32 +; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) 
nounwind { + %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_var_literal_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_literal_var_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmin.ll b/test/CodeGen/AMDGPU/fmin.ll new file mode 100644 index 00000000000..defa8c09638 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll new file mode 100644 index 00000000000..0a76699b43e --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin3.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.minnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmin3_olt_0: +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmin +; SI-LABEL: {{^}}test_fmin3_olt_1: +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_min3_f32 
[[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll new file mode 100644 index 00000000000..e19a48f3f7e --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -0,0 +1,77 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmin_legacy_f64 +define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { + %r0 = extractelement <4 x double> %reg0, i32 0 + %r1 = extractelement <4 x double> %reg0, i32 1 + %r2 = fcmp uge double %r0, %r1 + %r3 = select i1 %r2, double %r1, double %r0 + %vec = insertelement <4 x double> undef, double %r3, i32 0 + store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f64 +define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ule double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f64 +define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ole double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f64 +define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp olt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f64 +define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load 
double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ult double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll new file mode 100644 index 00000000000..6a625c239d7 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -0,0 +1,123 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmin_legacy_f32 +; EG: MIN * +; SI-SAFE: v_min_legacy_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ule float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 
addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp olt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ult float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-NOT: v_min +; SI: v_cmp_le_f32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: s_endpgm +define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val0 = select i1 %cmp, float %a, float %b + store float %val0, float addrspace(1)* %out0, align 4 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll new file mode 100644 index 00000000000..0f929d6a81f --- /dev/null +++ b/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.minnum.f64(double, double) #0 +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 +declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 +declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 + +; FUNC-LABEL: @test_fmin_f64 +; SI: v_min_f64 +define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { + %val = call double @llvm.minnum.f64(double %a, double %b) #0 + store double %val, 
double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_v2f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 + store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_v4f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 + store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmin_v8f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 + store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_fmin_v16f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 + store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll new file mode 100644 index 00000000000..4d7b52540d8 --- /dev/null +++ b/test/CodeGen/AMDGPU/fminnum.ll @@ -0,0 +1,281 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.minnum.f32(float, float) #0 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 +declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 + +; FUNC-LABEL: @test_fmin_f32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %val = call float @llvm.minnum.f32(float %a, float %b) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_v2f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_v4f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: 
v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_v8f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmin_v16f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W +define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 
2143289344({{nan|1\.#QNAN0e\+00}}) +define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_val_nan +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_nan_val +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_var_immediate_f32 +; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float 
@llvm.minnum.f32(float %a, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_immediate_var_f32 +; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_var_literal_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_literal_var_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll new file mode 100644 index 00000000000..addc409c9eb --- /dev/null +++ b/test/CodeGen/AMDGPU/fmul.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}fmul_f32: +; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W + +; SI: v_mul_f32 +define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fmul float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +; FUNC-LABEL: {{^}}fmul_v2f32: +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} + +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fmul <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v4f32: +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fmul <4 x float> %a, %b + store <4 x float> %result, <4 x float> 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_mul_2_k: +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +; SI: s_endpgm +define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { + %y = fmul float %x, 2.0 + %z = fmul float %y, 3.0 + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_mul_2_k_inv: +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +; SI-NOT: v_mad_f32 +; SI: s_endpgm +define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { + %y = fmul float %x, 3.0 + %z = fmul float %y, 2.0 + store float %z, float addrspace(1)* %out + ret void +} + +; There should be three multiplies here; %a should be used twice (once +; negated), not duplicated into mul x, 5.0 and mul x, -5.0. +; FUNC-LABEL: {{^}}test_mul_twouse: +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { + %a = fmul float %x, 5.0 + %b = fsub float -0.0, %a + %c = fmul float %b, %y + %d = fmul float %c, %a + store float %d, float addrspace(1)* %out + ret void +} + +attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll new file mode 100644 index 00000000000..3c222eaba89 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmul64.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +; FUNC-LABEL: {{^}}fmul_f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fmul double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v2f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = fmul <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v4f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2) { + %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 + %r2 = fmul <4 x double> %r0, %r1 + store <4 x double> %r2, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll new file mode 100644 index 00000000000..ae84d841021 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmuladd.ll @@ -0,0 +1,199 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | 
FileCheck %s + +declare float @llvm.fmuladd.f32(float, float, float) +declare double @llvm.fmuladd.f64(double, double, double) +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; CHECK-LABEL: {{^}}fmuladd_f32: +; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} + +define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) { + %r0 = load float, float addrspace(1)* %in1 + %r1 = load float, float addrspace(1)* %in2 + %r2 = load float, float addrspace(1)* %in3 + %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) + store float %r3, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_f64: +; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; CHECK-LABEL: {{^}}fadd_a_a_b_f32: +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword 
[[RESULT]] +define void @fadd_a_a_b_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load float, float addrspace(1)* %gep.0 + %r1 = load float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %add.0, %r1 + store float %add.1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fadd_b_a_a_f32: +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fadd_b_a_a_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load float, float addrspace(1)* %gep.0 + %r1 = load float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %r1, %add.0 + store float %add.1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + + +; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret 
void +} + + +; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + + +; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r2.fneg = fsub float -0.000000e+00, %r2 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) + store float %r3, float addrspace(1)* %gep.out + ret void +} diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll new file mode 100644 index 00000000000..4fa9adaabda --- /dev/null +++ b/test/CodeGen/AMDGPU/fnearbyint.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s + +; This should have the exactly the same output as the test for rint, +; so no need to check anything. 
+ +declare float @llvm.nearbyint.f32(float) #0 +declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0 +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0 +declare double @llvm.nearbyint.f64(double) #0 +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 +declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 + + +define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { +entry: + %0 = call float @llvm.nearbyint.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +entry: + %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +entry: + %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +define void @nearbyint_f64(double addrspace(1)* %out, double %in) { +entry: + %0 = call double @llvm.nearbyint.f64(double %in) + store double %0, double addrspace(1)* %out + ret void +} +define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +entry: + %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) + store <2 x double> %0, <2 x double> addrspace(1)* %out + ret void +} + +define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +entry: + %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) + store <4 x double> %0, <4 x double> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll new file mode 100644 index 00000000000..8830e827366 --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FIXME: Check something here. Currently it seems fabs + fneg aren't +; into 2 modifiers, although theoretically that should work. 
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fadd = fadd double %y, %fsub
+ store double %fadd, double addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+ %x = load double, double addrspace(1)* %xptr, align 8
+ %y = load double, double addrspace(1)* %xptr, align 8
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fadd = fadd double %y, %fsub
+ store double %fadd, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fmul = fmul double %y, %fsub
+ store double %fmul, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fabs = call double @llvm.fabs.f64(double %bc)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fabs = call double @fabs(double %bc)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+ %fabs = call double @llvm.fabs.f64(double %in)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+ %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+ %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
+ store <2 x double> %fsub, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+ %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+ %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
+ 
store <4 x double> %fsub, <4 x double> addrspace(1)* %out + ret void +} + +declare double @fabs(double) readnone +declare double @llvm.fabs.f64(double) readnone +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll new file mode 100644 index 00000000000..3b4930d9897 --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -0,0 +1,118 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: +; SI-NOT: and +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| +define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { + %fabs = call float @llvm.fabs.f32(float %x) + %fsub = fsub float -0.000000e+00, %fabs + %fadd = fadd float %y, %fsub + store float %fadd, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: +; SI-NOT: and +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| +; SI-NOT: and +define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { + %fabs = call float @llvm.fabs.f32(float %x) + %fsub = fsub float -0.000000e+00, %fabs + %fmul = fmul float %y, %fsub + store float %fmul, float addrspace(1)* %out, align 4 + ret void +} + +; DAGCombiner will transform: +; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) +; unless isFabsFree returns true + +; FUNC-LABEL: {{^}}fneg_fabs_free_f32: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| +; R600: -PV + +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| +; R600: -PV + +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fabs = call float @fabs(float %bc) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_f32: +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { + %fabs = call float @llvm.fabs.f32(float %in) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_fneg_fabs_f32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %val = load float, float addrspace(1)* %in, align 4 + %fabs = call float @llvm.fabs.f32(float %val) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_v2f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: -PV +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; 
R600: -PV + +; FIXME: SGPR should be used directly for first src operand. +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI-NOT: 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + %fsub = fsub <2 x float> , %fabs + store <2 x float> %fsub, <2 x float> addrspace(1)* %out + ret void +} + +; FIXME: SGPR should be used directly for first src operand. +; FUNC-LABEL: {{^}}fneg_fabs_v4f32: +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI-NOT: 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + %fsub = fsub <4 x float> , %fabs + store <4 x float> %fsub, <4 x float> addrspace(1)* %out + ret void +} + +declare float @fabs(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll new file mode 100644 index 00000000000..aa6df209035 --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg.f64.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fneg_f64: +; GCN: v_xor_b32 +define void @fneg_f64(double addrspace(1)* %out, double %in) { + %fneg = fsub double -0.000000e+00, %in + store double %fneg, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v2f64: +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { + %fneg = fsub <2 x double> , %in + store <2 x double> %fneg, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v4f64: +; R600: -PV +; R600: -T +; R600: -PV +; R600: -PV + +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { + %fneg = fsub <4 x double> , %in + store <4 x double> %fneg, <4 x double> addrspace(1)* %out + ret void +} + +; DAGCombiner will transform: +; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x80000000)) +; unless the target returns true for isNegFree() + +; FUNC-LABEL: {{^}}fneg_free_f64: +; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}} +define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { + %bc = bitcast i64 %in to double + %fsub = fsub double 0.0, %bc + store double %fsub, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fneg_fold_f64: +; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-NOT: xor +; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] +define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { + %fsub = fsub double -0.0, %in + %fmul = fmul double %fsub, 
%in + store double %fmul, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll new file mode 100644 index 00000000000..a0fd539863c --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fneg_f32: +; R600: -PV + +; GCN: v_xor_b32 +define void @fneg_f32(float addrspace(1)* %out, float %in) { + %fneg = fsub float -0.000000e+00, %in + store float %fneg, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v2f32: +; R600: -PV +; R600: -PV + +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { + %fneg = fsub <2 x float> , %in + store <2 x float> %fneg, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v4f32: +; R600: -PV +; R600: -T +; R600: -PV +; R600: -PV + +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { + %fneg = fsub <4 x float> , %in + store <4 x float> %fneg, <4 x float> addrspace(1)* %out + ret void +} + +; DAGCombiner will transform: +; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000)) +; unless the target returns true for isNegFree() + +; FUNC-LABEL: {{^}}fneg_free_f32: +; R600-NOT: XOR +; R600: -KC0[2].Z + +; XXX: We could use v_add_f32_e64 with the negate bit here instead. 
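+; Illustrative only (hypothetical registers, not what is currently emitted or
+; checked): the VOP3 form with a negated source modifier would look roughly like
+;   v_add_f32_e64 v0, 0, -s0
+; computing 0 + (-x), which is equivalent to the fsub-from-zero matched below.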
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} +define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fsub = fsub float 0.0, %bc + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fold_f32: +; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: xor +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] +define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { + %fsub = fsub float -0.0, %in + %fmul = fmul float %fsub, %in + store float %fmul, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll new file mode 100644 index 00000000000..4fac5176fac --- /dev/null +++ b/test/CodeGen/AMDGPU/fp-classify.ll @@ -0,0 +1,131 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 +declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_isinf_pattern: +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} +; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] +; SI-NOT: v_cmp +; SI: s_endpgm +define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_not_isinf_pattern_0: +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_not_isinf_pattern_1: +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_isfinite_pattern_0: +; SI-NOT: v_cmp +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} +; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] +; SI-NOT: v_cmp +; SI: s_endpgm +define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Use negative infinity +; SI-LABEL: {{^}}test_isfinite_not_pattern_0: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; No 
fabs +; SI-LABEL: {{^}}test_isfinite_not_pattern_1: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %ninf = fcmp une float %x, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; fabs of different value +; SI-LABEL: {{^}}test_isfinite_not_pattern_2: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Wrong ordered compare type +; SI-LABEL: {{^}}test_isfinite_not_pattern_3: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp uno float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Wrong unordered compare +; SI-LABEL: {{^}}test_isfinite_not_pattern_4: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp16_to_fp.ll b/test/CodeGen/AMDGPU/fp16_to_fp.ll new file mode 100644 index 00000000000..5a79ca82bc2 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp16_to_fp.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone + +; SI-LABEL: {{^}}test_convert_fp16_to_fp32: +; SI: buffer_load_ushort [[VAL:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]] +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; SI-LABEL: {{^}}test_convert_fp16_to_fp64: +; SI: buffer_load_ushort [[VAL:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] +; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} diff --git 
a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll new file mode 100644 index 00000000000..67925ebd82b --- /dev/null +++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone + +; SI-LABEL: {{^}}test_convert_fp32_to_fp16: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_short [[RESULT]] +define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float, float addrspace(1)* %in, align 4 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 2 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll new file mode 100644 index 00000000000..12df6606e8f --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -0,0 +1,56 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @fp_to_sint_f64_i32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { + %result = fptosi double %in to i32 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { + %result = fptosi <2 x double> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_v4f64_v4i32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { + %result = fptosi <4 x double> %in to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_i64_f64 +; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} +; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 + +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] + +; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 + +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] +; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] +; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %val = load double, double addrspace(1)* %gep, align 8 + %cast = fptosi double %val to i64 + store i64 %cast, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll new file mode 100644 index 
00000000000..301a94b4904 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -0,0 +1,230 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +declare float @llvm.fabs.f32(float) #0 + +; FUNC-LABEL: {{^}}fp_to_sint_i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: s_endpgm +define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { + %conv = fptosi float %in to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: +; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} +define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { + %in.fabs = call float @llvm.fabs.f32(float %in) #0 + %conv = fptosi float %in.fabs to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_v2i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptosi <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_v4i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %value = load <4 x float>, <4 x float> addrspace(1) * %in + %result = fptosi <4 x float> %value to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_i64: + +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; Check that the compiler doesn't crash with a "cannot select" error +; SI: s_endpgm +define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +entry: + %0 = fptosi float %in to i64 + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_sint_v2i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; 
EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptosi <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_sint_v4i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptosi <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll new file mode 100644 index 00000000000..41bc2a78001 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}fp_to_uint_i32_f64: +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { + %cast = fptoui double %in to i32 + store i32 %cast, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @fp_to_uint_v2i32_v2f64 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { + %cast = fptoui <2 x double> %in to <2 x i32> + store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @fp_to_uint_v4i32_v4f64 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { + %cast = fptoui <4 x double> %in to <4 x i32> + store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @fp_to_uint_i64_f64 +; CI-DAG: buffer_load_dwordx2 
[[VAL:v\[[0-9]+:[0-9]+\]]] +; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} +; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 + +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] + +; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 + +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] +; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] +; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %val = load double, double addrspace(1)* %gep, align 8 + %cast = fptoui double %val to i64 + store i64 %cast, i64 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @fp_to_uint_v2i64_v2f64 +define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { + %cast = fptoui <2 x double> %in to <2 x i64> + store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: @fp_to_uint_v4i64_v4f64 +define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { + %cast = fptoui <4 x double> %in to <4 x i64> + store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll new file mode 100644 index 00000000000..b7b6ccc238b --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -0,0 +1,217 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} + +; SI: v_cvt_u32_f32_e32 +; SI: s_endpgm +define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { + %conv = fptoui float %in to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptoui <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 + +define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %value = load <4 x float>, <4 x float> addrspace(1) * %in + %result = fptoui <4 x float> %value to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_f32_to_i64: +; EG-DAG: AND_INT 
+; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { + %conv = fptoui float %x to i64 + store i64 %conv, i64 addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptoui <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptoui <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll new file mode 100644 index 00000000000..734a43be229 --- /dev/null +++ 
b/test/CodeGen/AMDGPU/fpext.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fpext_f32_to_f64: +; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { + %result = fpext float %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { + %result = fpext <2 x float> %in to <2 x double> + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { + %result = fpext <4 x float> %in to <4 x double> + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { + %result = fpext <8 x float> %in to <8 x double> + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll new file mode 100644 index 00000000000..385e10e7baa --- /dev/null +++ b/test/CodeGen/AMDGPU/fptrunc.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: +; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { + %result = fptrunc double %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { + %result = fptrunc <2 x double> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { + %result = fptrunc <4 x double> %in to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { + %result = fptrunc <8 x double> %in to <8 x float> + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/frem.ll 
b/test/CodeGen/AMDGPU/frem.ll new file mode 100644 index 00000000000..f245ef08cb9 --- /dev/null +++ b/test/CodeGen/AMDGPU/frem.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}frem_f32: +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 +; GCN-DAG: v_cmp +; GCN-DAG: v_mul_f32 +; GCN: v_rcp_f32_e32 +; GCN: v_mul_f32_e32 +; GCN: v_mul_f32_e32 +; GCN: v_trunc_f32_e32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 + %r0 = load float, float addrspace(1)* %in1, align 4 + %r1 = load float, float addrspace(1)* %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_frem_f32: +; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 +; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} +; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] +; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] +; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] +; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: s_endpgm +define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2) #1 { + %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 + %r0 = load float, float addrspace(1)* %in1, align 4 + %r1 = load float, float addrspace(1)* %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}frem_f64: +; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 +; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 +; GCN-DAG: v_div_fmas_f64 +; GCN-DAG: v_div_scale_f64 +; GCN-DAG: v_mul_f64 +; CI: v_trunc_f64_e32 +; CI: v_mul_f64 +; GCN: v_add_f64 +; GCN: buffer_store_dwordx2 +; GCN: s_endpgm +define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) #0 { + %r0 = load double, double addrspace(1)* %in1, align 8 + %r1 = load double, double addrspace(1)* %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_frem_f64: +; GCN: v_rcp_f64_e32 +; GCN: v_mul_f64 +; SI: v_bfe_u32 +; CI: v_trunc_f64_e32 +; GCN: v_fma_f64 +; GCN: s_endpgm +define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) #1 { + %r0 = load double, double addrspace(1)* %in1, align 8 + %r1 = load double, double addrspace(1)* %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, double addrspace(1)* %out, align 8 + ret void +} + +define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 + %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 + %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 + %r2 = frem <2 x float> %r0, %r1 + store <2 x float> 
%r2, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 + %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 + %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 + %r2 = frem <4 x float> %r0, %r1 + store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 + %r2 = frem <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll new file mode 100644 index 00000000000..04101346cdf --- /dev/null +++ b/test/CodeGen/AMDGPU/fsqrt.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s + +; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) + +; CHECK: {{^}}fsqrt_f32: +; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} + +define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %r0 = load float, float addrspace(1)* %in + %r1 = call float @llvm.sqrt.f32(float %r0) + store float %r1, float addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fsqrt_f64: +; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %r0 = load double, double addrspace(1)* %in + %r1 = call double @llvm.sqrt.f64(double %r0) + store double %r1, double addrspace(1)* %out + ret void +} + +declare float @llvm.sqrt.f32(float %Val) +declare double @llvm.sqrt.f64(double %Val) diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll new file mode 100644 index 00000000000..dfe41cb5b11 --- /dev/null +++ b/test/CodeGen/AMDGPU/fsub.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}v_fsub_f32: +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_fsub_f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W + +; SI: v_sub_f32_e32 {{v[0-9]+}}, 
{{s[0-9]+}}, {{v[0-9]+}} +define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { + %sub = fsub float %a, %b + store float %sub, float addrspace(1)* %out, align 4 + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +; FUNC-LABEL: {{^}}fsub_v2f32: +; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z +; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y + +; FIXME: Should be using SGPR directly for first operand +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { + %sub = fsub <2 x float> %a, %b + store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_fsub_v4f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} + +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 + %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 + %result = fsub <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: Should be using SGPR directly for first operand + +; FUNC-LABEL: {{^}}s_fsub_v4f32: +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: s_endpgm +define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { + %result = fsub <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll new file mode 100644 index 00000000000..f34a48e30a8 --- /dev/null +++ b/test/CodeGen/AMDGPU/fsub64.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.fabs.f64(double) #0 + +; SI-LABEL: {{^}}fsub_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fsub double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_fabs_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} +define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* 
%in1 + %r1 = load double, double addrspace(1)* %in2 + %r1.fabs = call double @llvm.fabs.f64(double %r1) #0 + %r2 = fsub double %r0, %r1.fabs + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_fabs_inv_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} +define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r0.fabs = call double @llvm.fabs.f64(double %r0) #0 + %r2 = fsub double %r0.fabs, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double %a, %b + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_imm_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double 4.0, %a + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_imm_inv_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double %a, 4.0 + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_self_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { + %sub = fsub double %a, %a + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_v2f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { + %sub = fsub <2 x double> %a, %b + store <2 x double> %sub, <2 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_v4f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 + %a = load <4 x double>, <4 x double> addrspace(1)* %in + %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr + %result = fsub <4 x double> %a, %b + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_v4f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { + %result = fsub <4 x double> %a, %b + store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +attributes #0 = { nounwind 
readnone } diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll new file mode 100644 index 00000000000..6618d8b5e57 --- /dev/null +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.trunc.f64(double) nounwind readnone +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}v_ftrunc_f64: +; CI: v_trunc_f64 +; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 +; SI: s_endpgm +define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %x = load double, double addrspace(1)* %in, align 8 + %y = call double @llvm.trunc.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_f64: +; CI: v_trunc_f64_e32 + +; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 +; SI: s_lshr_b64 +; SI: s_not_b64 +; SI: s_and_b64 +; SI: cmp_gt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: cmp_lt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: s_endpgm +define void @ftrunc_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.trunc.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v2f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64: +; FIXME-CI: v_trunc_f64_e32 +; FIXME-CI: v_trunc_f64_e32 +; FIXME-CI: v_trunc_f64_e32 +; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ftrunc_v4f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v8f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v16f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; 
CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll new file mode 100644 index 00000000000..edc08609a8a --- /dev/null +++ b/test/CodeGen/AMDGPU/ftrunc.ll @@ -0,0 +1,120 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s + +declare float @llvm.trunc.f32(float) nounwind readnone +declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone + +; FUNC-LABEL: {{^}}ftrunc_f32: +; EG: TRUNC +; SI: v_trunc_f32_e32 +define void @ftrunc_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.trunc.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v2f32: +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32: +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-SI: v_trunc_f32_e32 +; FIXME-SI: v_trunc_f32_e32 +; FIXME-SI: v_trunc_f32_e32 +; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone +; store <3 x float> %y, <3 x float> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ftrunc_v4f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v8f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v16f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: 
v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll new file mode 100644 index 00000000000..471b0f6b13e --- /dev/null +++ b/test/CodeGen/AMDGPU/gep-address-space.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s + +define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { +; CHECK-LABEL: {{^}}use_gep_address_space: +; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} +; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 + %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16 + store i32 99, i32 addrspace(3)* %p + ret void +} + +define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { +; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: +; The LDS offset will be 65536 bytes, which is larger than the size of LDS on +; SI, which is why it is being OR'd with the base pointer. 
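+; (Worked out from the GEP below: index 16384 into an i32 array gives a byte
+; offset of 16384 * 4 = 65536, i.e. 0x10000.)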
+; SI: s_or_b32 +; CI: s_add_i32 +; CHECK: ds_write_b32 + %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 + store i32 99, i32 addrspace(3)* %p + ret void +} + +define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { +; CHECK-LABEL: {{^}}gep_as_vector_v4: +; CHECK: s_add_i32 +; CHECK: s_add_i32 +; CHECK: s_add_i32 +; CHECK: s_add_i32 + %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> + %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 + %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 + %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2 + %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3 + store i32 99, i32 addrspace(3)* %p0 + store i32 99, i32 addrspace(3)* %p1 + store i32 99, i32 addrspace(3)* %p2 + store i32 99, i32 addrspace(3)* %p3 + ret void +} + +define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { +; CHECK-LABEL: {{^}}gep_as_vector_v2: +; CHECK: s_add_i32 +; CHECK: s_add_i32 + %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> + %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 + %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 + store i32 99, i32 addrspace(3)* %p0 + store i32 99, i32 addrspace(3)* %p1 + ret void +} + diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll new file mode 100644 index 00000000000..be775cf9292 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-directive.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Make sure the GlobalDirective isn't merged with the function name + +; SI: .globl foo +; SI: {{^}}foo: +define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global-extload-i1.ll b/test/CodeGen/AMDGPU/global-extload-i1.ll new file mode 100644 index 00000000000..bd9557d730f --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i1.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: Evergreen broken + +; FUNC-LABEL: {{^}}zextload_global_i1_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = zext i1 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i1_to_i32: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = sext i1 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32: +; 
SI: s_endpgm +define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = zext <1 x i1> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32: +; SI: s_endpgm +define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = sext <1 x i1> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = zext <2 x i1> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = sext <2 x i1> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = zext <4 x i1> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = sext <4 x i1> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = zext <8 x i1> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = sext <8 x i1> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = zext <16 x i1> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = sext <16 x i1> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32: 
+; XSI: s_endpgm +; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = zext <32 x i1> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32: +; XSI: s_endpgm +; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = sext <32 x i1> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32: +; XSI: s_endpgm +; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = zext <64 x i1> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32: +; XSI: s_endpgm +; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = sext <64 x i1> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} +; SI: buffer_store_dwordx2 +define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = zext i1 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i1_to_i64: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = sext i1 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = zext <1 x i1> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = sext <1 x i1> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = zext <2 x i1> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> 
addrspace(1)* %in + %ext = sext <2 x i1> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = zext <4 x i1> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = sext <4 x i1> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = zext <8 x i1> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = sext <8 x i1> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = zext <16 x i1> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = sext <16 x i1> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64: +; XSI: s_endpgm +; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = zext <32 x i1> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64: +; XSI: s_endpgm +; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = sext <32 x i1> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64: +; XSI: s_endpgm +; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = zext <64 x i1> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64: +; XSI: s_endpgm +; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x 
i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = sext <64 x i1> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll new file mode 100644 index 00000000000..103a40dee27 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: cypress is broken because the bigger testcases spill and it's not implemented + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: +; SI: buffer_load_ushort +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: +; SI: buffer_load_sshort +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: +; SI: buffer_load_ushort +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: +; SI: buffer_load_sshort +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* 
nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: +; SI: buffer_load_ushort v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; 
FUNC-LABEL: {{^}}sextload_global_i16_to_i64: +; SI: buffer_load_sshort [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to 
<16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll new file mode 100644 index 00000000000..79b83452939 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -0,0 +1,457 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: +; SI: buffer_load_dword v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %a = load i32, i32 addrspace(1)* %in + %ext = zext i32 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i32_to_i64: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %a = load i32, i32 addrspace(1)* %in + %ext = sext i32 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64: +; SI: buffer_load_dword +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x 
i32>, <1 x i32> addrspace(1)* %in + %ext = zext <1 x i32> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64: +; SI: buffer_load_dword +; SI: v_ashrrev_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i32>, <1 x i32> addrspace(1)* %in + %ext = sext <1 x i32> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: +; SI: buffer_load_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in + %ext = zext <2 x i32> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64: +; SI: buffer_load_dwordx2 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in + %ext = sext <2 x i32> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i32>, <4 x i32> addrspace(1)* %in + %ext = zext <4 x i32> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64: +; SI: buffer_load_dwordx4 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i32>, <4 x i32> addrspace(1)* %in + %ext = sext <4 x i32> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i32>, <8 x i32> addrspace(1)* %in + %ext = zext <8 x i32> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; 
SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i32>, <8 x i32> addrspace(1)* %in + %ext = sext <8 x i32> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i32>, <16 x i32> addrspace(1)* %in + %ext = sext <16 x i32> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 + +; SI: s_endpgm +define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i32>, <16 x i32> addrspace(1)* %in + %ext = zext <16 x i32> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out 
+ ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i32>, <32 x i32> addrspace(1)* %in + %ext = sext <32 x i32> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: 
buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i32>, <32 x i32> addrspace(1)* %in + %ext = zext <32 x i32> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global-extload-i8.ll b/test/CodeGen/AMDGPU/global-extload-i8.ll new file mode 100644 index 00000000000..b31d5361d5a --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i8.ll @@ -0,0 +1,299 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}zextload_global_i8_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = zext i8 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i8_to_i32: +; SI: buffer_load_sbyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i32> + store 
<1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = sext <16 x i8> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32: +; XSI: s_endpgm +; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = zext <32 x i8> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32: +; XSI: s_endpgm +; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = sext 
<32 x i8> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32: +; XSI: s_endpgm +; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = zext <64 x i8> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32: +; XSI: s_endpgm +; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = sext <64 x i8> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: +; SI: buffer_load_ubyte v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = zext i8 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i8_to_i64: +; SI: buffer_load_sbyte [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x 
i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = sext <16 x i8> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64: +; XSI: s_endpgm +; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = zext <32 x i8> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64: +; XSI: s_endpgm +; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = sext <32 x i8> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64: +; XSI: s_endpgm +; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = zext <64 x i8> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64: +; XSI: s_endpgm +; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = sext <64 x i8> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } diff --git a/test/CodeGen/AMDGPU/global-zero-initializer.ll b/test/CodeGen/AMDGPU/global-zero-initializer.ll new file mode 100644 index 00000000000..45aa8bf4e1d --- /dev/null +++ b/test/CodeGen/AMDGPU/global-zero-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported 
initializer for address space in load_init_global_global + +@lds = addrspace(1) global [256 x i32] zeroinitializer + +define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(1)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll new file mode 100644 index 00000000000..847950f6376 --- /dev/null +++ b/test/CodeGen/AMDGPU/global_atomics.ll @@ -0,0 +1,801 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_i32_offset: +; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: +; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: +; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: +; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32: +; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret: +; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64: +; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile add i32 addrspace(1)* 
%ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: +; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_offset: +; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: +; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: +; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: +; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32: +; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret: +; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64: +; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: +; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_offset: +; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: +; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32: +; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: +; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* 
%out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_offset: +; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: +; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32: +; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64: +; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, 
i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_offset: +; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: +; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32: +; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: +; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_offset: +; SI: 
buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: +; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32: +; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64: +; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_offset: +; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { 
+entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: +; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32: +; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: +; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_offset: +; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + 
ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: +; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: +; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: +; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32: +; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret: +; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64: +; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: +; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: +; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] 
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: +; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32: +; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: +; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_offset: +; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, 
i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: +; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32: +; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: +; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll new file mode 100644 index 00000000000..014b0a5482a --- /dev/null +++ b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1 + +; FUNC-LABEL: {{^}}test_i8: +; EG: CF_END +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 { + %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s + %1 = load i8, i8 addrspace(2)* %arrayidx, align 1 + store i8 
%1, i8 addrspace(1)* %out + ret void +} + +@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 + +; FUNC-LABEL: {{^}}test_i16: +; EG: CF_END +; SI: buffer_store_short +; SI: s_endpgm +define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 { + %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s + %1 = load i16, i16 addrspace(2)* %arrayidx, align 2 + store i16 %1, i16 addrspace(1)* %out + ret void +} + +%struct.bar = type { float, [5 x i8] } + +; The illegal i8s aren't handled +@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ] + +; FUNC-LABEL: {{^}}struct_bar_gv_load: +define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i8, i8 addrspace(2)* %gep, align 1 + store i8 %load, i8 addrspace(1)* %out, align 1 + ret void +} + + +; The private load isn't scalarized. +@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> , + <4 x i32> , + <4 x i32> , + <4 x i32> ] + +; FUNC-LABEL: {{^}}array_vector_gv_load: +define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index + %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16 + store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll new file mode 100644 index 00000000000..3c1fc6c98f7 --- /dev/null +++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -0,0 +1,101 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 + +@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 + +; FUNC-LABEL: {{^}}float: +; FIXME: We should be using s_load_dword here. +; SI: buffer_load_dword +; VI: s_load_dword + +; EG-DAG: MOV {{\** *}}T2.X +; EG-DAG: MOV {{\** *}}T3.X +; EG-DAG: MOV {{\** *}}T4.X +; EG-DAG: MOV {{\** *}}T5.X +; EG-DAG: MOV {{\** *}}T6.X +; EG: MOVA_INT + +define void @float(float addrspace(1)* %out, i32 %index) { +entry: + %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %1 = load float, float addrspace(2)* %0 + store float %1, float addrspace(1)* %out + ret void +} + +@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 + +; FUNC-LABEL: {{^}}i32: + +; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword +; VI: s_load_dword + +; EG-DAG: MOV {{\** *}}T2.X +; EG-DAG: MOV {{\** *}}T3.X +; EG-DAG: MOV {{\** *}}T4.X +; EG-DAG: MOV {{\** *}}T5.X +; EG-DAG: MOV {{\** *}}T6.X +; EG: MOVA_INT + +define void @i32(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +%struct.foo = type { float, [5 x i32] } + +@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] + +; FUNC-LABEL: {{^}}struct_foo_gv_load: +; GCN: s_load_dword + +define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i32, i32 addrspace(2)* %gep, align 4 + store i32 %load, i32 addrspace(1)* %out, align 4 + ret void +} + +@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> , + <1 x i32> , + <1 x i32> , + <1 x i32> ] + +; FUNC-LABEL: {{^}}array_v1_gv_load: +; FIXME: We should be using s_load_dword here. +; SI: buffer_load_dword +; VI: s_load_dword +define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index + %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 + store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 + ret void +} + +define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +entry: + %0 = icmp eq i32 0, %a + br i1 %0, label %if, label %else + +if: + %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %2 = load float, float addrspace(2)* %1 + store float %2, float addrspace(1)* %out + br label %endif + +else: + store float 1.0, float addrspace(1)* %out + br label %endif + +endif: + ret void +} diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll new file mode 100644 index 00000000000..bf8f11860b5 --- /dev/null +++ b/test/CodeGen/AMDGPU/half.ll @@ -0,0 +1,525 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; half args should be promoted to float + +; GCN-LABEL: {{^}}load_f16_arg: +; GCN: s_load_dword [[ARG:s[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] +; GCN: buffer_store_short [[CVT]] +define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { + store half %arg, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v2f16_arg: +; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 +; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: s_endpgm +define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { + store <2 x half> %arg, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v3f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_short +; 
GCN-NOT: buffer_store +; GCN: s_endpgm +define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { + store <3 x half> %arg, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v4f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { + store <4 x half> %arg, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v8f16_arg: +define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { + store <8 x half> %arg, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_arg: +define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { + %fpext = fpext <2 x half> %in to <2 x float> + store <2 x float> %fpext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f32_arg: +define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to float + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: +define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x float> + store <2 x float> %ext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16 +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_dwordx2 +; GCN: s_endpgm +define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x float> + store <3 x float> %ext, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: +define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x float> + store <4 x float> %ext, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x float> + store <8 x float> %ext, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f64_arg: +define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to double + store double %ext, double addrspace(1)* %out + ret void +} +; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: +define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x double> + store <2 x double> %ext, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: +define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x double> + store <3 x double> %ext, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: +define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x double> + store <4 x 
double> %ext, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: +define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x double> + store <8 x double> %ext, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_f16: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + store half %val, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v2f16: +; GCN: buffer_load_dword [[TMP:v[0-9]+]] +; GCN: buffer_store_dword [[TMP]] +define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + store <2 x half> %val, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v4f16: +; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[TMP]] +define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + store <4 x half> %val, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v8f16: +; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: s_endpgm +define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + store <8 x half> %val, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_f16_to_f32: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_dword [[CVT]] +define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to float + store float %cvt, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: +define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: +define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: +define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: +define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}global_extload_v16f16_to_v16f32: +define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x float> + store <16 x float> %cvt, <16 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_f16_to_f64: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] +; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] +; GCN: buffer_store_dwordx2 [[CVT1]] +define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to double + store double %cvt, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: +define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x double> + store <2 x double> %cvt, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: +define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x double> + store <3 x double> %cvt, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: +define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x double> + store <4 x double> %cvt, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: +define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x double> + store <8 x double> %cvt, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: +define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, <16 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_short [[CVT]] +define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { + %val = load float, float addrspace(1)* %in + %cvt = fptrunc float %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: +; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] +; GCN-DAG: buffer_store_short [[CVT0]] +; GCN-DAG: buffer_store_short [[CVT1]] +; GCN: s_endpgm +define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { + %val = load <2 x float>, <2 x float> addrspace(1)* %in + %cvt = fptrunc <2 x float> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void 
+} + +; FIXME: Shouldn't do 4th conversion +; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_dword +; GCN: s_endpgm +define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %val = load <3 x float>, <3 x float> addrspace(1)* %in + %cvt = fptrunc <3 x float> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { + %val = load <4 x float>, <4 x float> addrspace(1)* %in + %cvt = fptrunc <4 x float> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { + %val = load <8 x float>, <8 x float> addrspace(1)* %in + %cvt = fptrunc <8 x float> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void 
@global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { + %val = load <16 x float>, <16 x float> addrspace(1)* %in + %cvt = fptrunc <16 x float> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} + +; FIXME: Unsafe math should fold conversions away +; GCN-LABEL: {{^}}fadd_f16: +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { + %add = fadd half %a, %b + store half %add, half addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2f16: +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { + %add = fadd <2 x half> %a, %b + store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v4f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 + %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 + %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x half> %a, %b + store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 + ret void +} + +; GCN-LABEL: {{^}}fadd_v8f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { + %add = fadd <8 x half> %a, %b + store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 + ret void +} + +; GCN-LABEL: {{^}}fsub_f16: +; GCN: v_subrev_f32_e32 +; GCN: s_endpgm +define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 + %a = load half, half addrspace(1)* %in + %b = load half, half addrspace(1)* %b_ptr + %sub = fsub half %a, %b + store half %sub, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_from_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { + %val = load half, half addrspace(1)* %in + %val_int = bitcast half %val to i16 + store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_to_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { + %val = load i16, i16 addrspace(1)* %in + %val_fp = bitcast i16 %val to half + store half %val_fp, half addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll new file mode 100644 index 00000000000..f9113399afe --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +; HSA: .section .hsa.version +; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0" +; HSA: {{^}}simple: +; Make sure we are setting the ATC bit: +; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000 +; HSA: buffer_store_dword v{{[0-9]+}}, 
s[0:[[HI]]], 0 + +define void @simple(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll new file mode 100644 index 00000000000..b11a2113764 --- /dev/null +++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SILowerI1Copies was not handling IMPLICIT_DEF +; SI-LABEL: {{^}}br_implicit_def: +; SI: BB#0: +; SI-NEXT: s_and_saveexec_b64 +; SI-NEXT: s_xor_b64 +; SI-NEXT: BB#1: +define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { +bb: + br i1 undef, label %bb1, label %bb2 + +bb1: + store volatile i32 123, i32 addrspace(1)* %out + ret void + +bb2: + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll new file mode 100644 index 00000000000..105cd06b330 --- /dev/null +++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}br_i1_phi: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; SI: s_and_saveexec_b64 +; SI: s_xor_b64 +; SI: v_mov_b32_e32 [[REG]], -1{{$}} +; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]] +; SI: s_and_saveexec_b64 +; SI: s_xor_b64 +; SI: s_endpgm +define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { +bb: + br i1 %arg1, label %bb2, label %bb3 + +bb2: ; preds = %bb + br label %bb3 + +bb3: ; preds = %bb2, %bb + %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] + br i1 %tmp, label %bb4, label %bb6 + +bb4: ; preds = %bb3 + %tmp5 = mul i32 undef, %arg + br label %bb6 + +bb6: ; preds = %bb4, %bb3 + ret void +} diff --git a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll new file mode 100644 index 00000000000..c218e1918bb --- /dev/null +++ b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll @@ -0,0 +1,11 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8, i8 addrspace(1)* %in + %2 = uitofp i8 %1 to double + %3 = fptrunc double %2 to float + store float %3, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll new file mode 100644 index 00000000000..60e59a5a528 --- /dev/null +++ b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Test that a select with reversed True/False values is correctly lowered +;to a SETNE_INT. There should only be one SETNE_INT instruction. 
+ +;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK-NOT: SETNE_INT + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx1 + %cmp = icmp eq i32 %0, %1 + %value = select i1 %cmp, i32 0, i32 -1 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll new file mode 100644 index 00000000000..0eaa33ebafe --- /dev/null +++ b/test/CodeGen/AMDGPU/icmp64.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}test_i64_eq: +; SI: v_cmp_eq_i64 +define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp eq i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ne: +; SI: v_cmp_ne_i64 +define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ne i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_slt: +; SI: v_cmp_lt_i64 +define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp slt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ult: +; SI: v_cmp_lt_u64 +define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ult i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sle: +; SI: v_cmp_le_i64 +define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sle i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ule: +; SI: v_cmp_le_u64 +define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ule i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sgt: +; SI: v_cmp_gt_i64 +define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sgt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ugt: +; SI: v_cmp_gt_u64 +define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ugt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sge: +; SI: v_cmp_ge_i64 +define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sge i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_uge: +; SI: v_cmp_ge_u64 +define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp uge i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll new file mode 100644 index 
00000000000..12eed550eb1 --- /dev/null +++ b/test/CodeGen/AMDGPU/imm.ll @@ -0,0 +1,617 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s + +; Use a 64-bit value with lo bits that can be represented as an inline constant +; CHECK-LABEL: {{^}}i64_imm_inline_lo: +; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 +; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: +define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { +entry: + store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 + ret void +} + +; Use a 64-bit value with hi bits that can be represented as an inline constant +; CHECK-LABEL: {{^}}i64_imm_inline_hi: +; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 +; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] +; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] +define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { +entry: + store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 + ret void +} + +; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { + store i64 -9223372036854775808, i64 addrspace(1) *%out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { + store i32 -2147483648, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { + store float 0.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; CHECK: buffer_store_dword [[REG]] +define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { + store float -0.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { + store float 0.5, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { + store float -0.5, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { + store float 1.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { + store float -1.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32: +; CHECK: 
v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { + store float 2.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { + store float -2.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { + store float 4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { + store float -4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_literal_imm_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 +; CHECK: buffer_store_dword [[REG]] +define void @store_literal_imm_f32(float addrspace(1)* %out) { + store float 4096.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 1.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -1.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 2.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_2.0_f32(float 
addrspace(1)* %out, float %x) { + %y = fadd float %x, -2.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 4.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -4.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32: +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %x = load float, float addrspace(1)* %in + %y = fadd float %x, 0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}commute_add_literal_f32: +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %x = load float, float addrspace(1)* %in + %y = fadd float %x, 1024.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36a0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36b0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_16_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36e0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xffffffffe0000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xffffffffc0000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]] +; CHECK: buffer_store_dword 
[[REG]] +define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xfffffffe00000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_63_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36ff800000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_64_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x3700000000000000 + store float %y, float addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0.5 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -0.5 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 1.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -1.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void 
@add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 2.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -2.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 4.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -4.0 + store double %y, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}add_inline_imm_1_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000001 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000002 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_16_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000010 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xffffffffffffffff + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64: +; SI: 
s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xfffffffffffffffe + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xfffffffffffffff0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_63_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x000000000000003F + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_64_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000040 + store double %y, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64: +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { + store double 0.0, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { + store double -0.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { + store double 0.5, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { + store double -0.5, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: 
v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { + store double 1.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { + store double -1.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { + store double 2.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { + store double -2.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { + store double 4.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { + store double -4.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_literal_imm_f64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_literal_imm_f64(double addrspace(1)* %out) { + store double 4096.0, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll new file mode 100644 index 00000000000..f551606d63a --- /dev/null +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -0,0 +1,121 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; Tests for indirect addressing on SI, which is implemented using dynamic +; indexing of vectors. 
+
+; CHECK-LABEL: {{^}}extract_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0
+define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+entry:
+  %index = add i32 %offset, -512
+  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0
+; CHECK: s_cbranch_execnz
+define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -512
+  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
+  %2 = extractelement <4 x float> %1, i32 2
+  store float %2, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
+  %1 = extractelement <4 x float> %0, i32 2
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}}
+define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+entry:
+  %index = add i32 %offset, -512
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -512
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, -{{[0-9]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -16
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
new file mode 100644
index 00000000000..d63e1b6c521
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
+
+declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+
+; SI-LABEL: {{^}}private_access_f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load double, double addrspace(1)* %in, align 8
+  %array = alloca double, i32 16, align 8
+  %ptr = getelementptr double, double* %array, i32 %b
+  store double %val, double* %ptr, align 8
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  %result = load double, double* %ptr, align 8
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}private_access_v2f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
+  %array = alloca <2 x double>, i32 16, align 16
+  %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
+  store <2 x double> %val, <2 x double>* %ptr, align 16
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  %result = load <2 x double>, <2 x double>* %ptr, align 16
+  store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}private_access_i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load i64, i64 addrspace(1)* %in, align 8
+  %array = alloca i64, i32 16, align 8
+  %ptr = getelementptr i64, i64* %array, i32 %b
+  store i64 %val, i64* %ptr, align 8
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ 
%result = load i64, i64* %ptr, align 8 + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}private_access_v2i64_alloca: + +; SI-ALLOCA: buffer_store_dwordx4 +; SI-ALLOCA: buffer_load_dwordx4 + +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %array = alloca <2 x i64>, i32 16, align 16 + %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b + store <2 x i64> %val, <2 x i64>* %ptr, align 16 + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + %result = load <2 x i64>, <2 x i64>* %ptr, align 16 + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll new file mode 100644 index 00000000000..f6e39b3d830 --- /dev/null +++ b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll @@ -0,0 +1,10 @@ +; XFAIL: * +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s + +define void @inf_loop_irreducible_cfg() nounwind { +entry: + br label %block + +block: + br label %block +} diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll new file mode 100644 index 00000000000..7233aa57fd7 --- /dev/null +++ b/test/CodeGen/AMDGPU/infinite-loop.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}infinite_loop: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 +; SI: BB0_1: +; SI: buffer_store_dword [[REG]] +; SI: s_waitcnt vmcnt(0) expcnt(0) +; SI: s_branch BB0_1 +define void @infinite_loop(i32 addrspace(1)* %out) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + store i32 999, i32 addrspace(1)* %out, align 4 + br label %for.body +} + diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll new file mode 100644 index 00000000000..efc2292de3a --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-asm.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}inline_asm: +; CHECK: s_endpgm +; CHECK: s_endpgm +define void @inline_asm(i32 addrspace(1)* %out) { +entry: + store i32 5, i32 addrspace(1)* %out + call void asm sideeffect "s_endpgm", ""() + ret void +} diff --git a/test/CodeGen/AMDGPU/inline-calls.ll b/test/CodeGen/AMDGPU/inline-calls.ll new file mode 100644 index 00000000000..33a4c832e75 --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-calls.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s + +; CHECK-NOT: {{^}}func: +define internal fastcc i32 @func(i32 %a) { +entry: + %tmp0 = add i32 %a, 1 + ret i32 %tmp0 +} + +; CHECK: {{^}}kernel: +define void @kernel(i32 addrspace(1)* %out) { +entry: + %tmp0 = call i32 @func(i32 1) + store i32 %tmp0, i32 addrspace(1)* %out + ret void +} + +; CHECK: 
{{^}}kernel2: +define void @kernel2(i32 addrspace(1)* %out) { +entry: + call void @kernel(i32 addrspace(1)* %out) + ret void +} diff --git a/test/CodeGen/AMDGPU/input-mods.ll b/test/CodeGen/AMDGPU/input-mods.ll new file mode 100644 index 00000000000..1c4d285cbcb --- /dev/null +++ b/test/CodeGen/AMDGPU/input-mods.ll @@ -0,0 +1,26 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM + +;EG-LABEL: {{^}}test: +;EG: EXP_IEEE * +;CM-LABEL: {{^}}test: +;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| +;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| +;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| +;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = call float @llvm.fabs.f32(float %r0) + %r2 = fsub float -0.000000e+00, %r1 + %r3 = call float @llvm.exp2.f32(float %r2) + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/insert_subreg.ll b/test/CodeGen/AMDGPU/insert_subreg.ll new file mode 100644 index 00000000000..4a5e8869c2d --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_subreg.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s + +; Test that INSERT_SUBREG instructions don't have non-register operands after +; instruction selection. + +; Make sure this doesn't crash +; CHECK-LABEL: test: +define void @test(i64 addrspace(1)* %out) { +entry: + %tmp0 = alloca [16 x i32] + %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 + %tmp2 = sext i32 %tmp1 to i64 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll new file mode 100644 index 00000000000..6de3d408c48 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -0,0 +1,252 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +; FIXME: Broken on evergreen +; FIXME: For some reason the 8 and 16 vectors are being stored as +; individual elements instead of 128-bit stores. + + +; FIXME: Why is the constant moved into the intermediate register and +; not just directly into the vector component? 
+ +; SI-LABEL: {{^}}insertelement_v4f32_0: +; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]: +; v_mov_b32_e32 +; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00 +; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] +; buffer_store_dwordx4 v{{[}}[[LOW_REG]]: +define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_1: +define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_2: +define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_3: +define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4i32_0: +define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { + %vecins = insertelement <4 x i32> %a, i32 999, i32 0 + store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2f32: +; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: +define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { + %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b + store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4f32: +; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: +define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8f32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { + %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b + store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16f32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { + %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b + store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2i32: +; SI: buffer_store_dwordx2 +define void @dynamic_insertelement_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i32> %a, i32 5, i32 %b + store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i32: +; SI: buffer_store_dwordx4 +define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i32> %a, i32 5, i32 %b + store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8i32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <8 x i32> %a, i32 5, i32 %b + store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16i32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <16 x i32> %a, i32 5, i32 %b + store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + + +; SI-LABEL: {{^}}dynamic_insertelement_v2i16: +; FIXMESI: buffer_store_dwordx2 +define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i16> %a, i16 5, i32 %b + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i16: +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i16> %a, i16 5, i32 %b + store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16 + ret void +} + + +; SI-LABEL: {{^}}dynamic_insertelement_v2i8: +; FIXMESI: BUFFER_STORE_USHORT +define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i8> %a, i8 5, i32 %b + store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i8: +; FIXMESI: buffer_store_dword +define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i8> %a, i8 5, i32 %b + store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8i8: +; FIXMESI: buffer_store_dwordx2 +define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <8 x i8> %a, i8 5, i32 %b + store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16i8: +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <16 x i8> %a, i8 5, i32 %b + store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 + ret void +} + +; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that +; the compiler doesn't crash. 
+; SI-LABEL: {{^}}insert_split_bb: +define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +entry: + %0 = insertelement <2 x i32> undef, i32 %a, i32 0 + %1 = icmp eq i32 %a, 0 + br i1 %1, label %if, label %else + +if: + %2 = load i32, i32 addrspace(1)* %in + %3 = insertelement <2 x i32> %0, i32 %2, i32 1 + br label %endif + +else: + %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %5 = load i32, i32 addrspace(1)* %4 + %6 = insertelement <2 x i32> %0, i32 %5, i32 1 + br label %endif + +endif: + %7 = phi <2 x i32> [%3, %if], [%6, %else] + store <2 x i32> %7, <2 x i32> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { + %vecins = insertelement <2 x double> %a, double 8.0, i32 %b + store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2i64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i64> %a, i64 5, i32 %b + store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { + %vecins = insertelement <4 x double> %a, double 8.0, i32 %b + store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { + %vecins = insertelement <8 x double> %a, double 8.0, i32 %b + store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/jump-address.ll b/test/CodeGen/AMDGPU/jump-address.ll new file mode 100644 index 00000000000..f55912e3740 --- /dev/null +++ b/test/CodeGen/AMDGPU/jump-address.ll @@ -0,0 +1,52 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: JUMP @6 +; CHECK: EXPORT +; CHECK-NOT: EXPORT + +define void @main() #0 { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + %4 = sext i1 %3 to i32 + %5 = bitcast i32 %4 to float + %6 = bitcast float %5 to i32 + %7 = icmp ne i32 %6, 0 + br i1 %7, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %9 = extractelement <4 x float> %8, i32 0 + %10 = bitcast float %9 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float 
%13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF13, label %ENDIF + +ENDIF: ; preds = %IF13, %ELSE, %main_body + %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %16 = insertelement <4 x float> undef, float %temp.0, i32 0 + %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp2.0, i32 2 + %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) + ret void + +IF13: ; preds = %ELSE + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fsub float -0.000000e+00, %21 + %23 = fadd float 0xFFF8000000000000, %22 + br label %ENDIF +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll new file mode 100644 index 00000000000..7e2291cfdc3 --- /dev/null +++ b/test/CodeGen/AMDGPU/kcache-fold.ll @@ -0,0 +1,100 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main1: +; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} +define void @main1() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fcmp ogt float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %11 = extractelement <4 x float> %10, i32 1 + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ogt float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float>, <4 x float> addrspace(8)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 2 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ogt float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float>, <4 x float> addrspace(8)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %27 = extractelement <4 x float> %26, i32 3 + %28 = load <4 x 
float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fcmp ogt float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +; CHECK: {{^}}main2: +; CHECK-NOT: MOV +define void @main2() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %5 = extractelement <4 x float> %4, i32 1 + %6 = fcmp ogt float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %11 = extractelement <4 x float> %10, i32 0 + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ogt float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float>, <4 x float> addrspace(8)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 3 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ogt float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float>, <4 x float> addrspace(8)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %27 = extractelement <4 x float> %26, i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 2 + %30 = fcmp ogt float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call float 
@llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDIL.clamp.(float, float, float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll new file mode 100644 index 00000000000..1dd7c2cb799 --- /dev/null +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -0,0 +1,473 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC + +; FUNC-LABEL: {{^}}i8_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; GCN: buffer_load_ubyte + +define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i8_zext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i8_sext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { +entry: + %0 = sext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; GCN: buffer_load_ushort + +define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_zext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_sext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { +entry: + %0 = sext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i32_arg: +; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c +define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { +entry: + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}f32_arg: +; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z 
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c +define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { +entry: + store float %in, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v2i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { +entry: + store <2 x i8> %in, <2 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN-DAG: buffer_load_ushort +; GCN-DAG: buffer_load_ushort +define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { +entry: + store <2 x i16> %in, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb +; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c +define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { +entry: + store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v2f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb +; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c +define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { +entry: + store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3i8_arg: +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 +define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +entry: + store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3i16_arg: +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 +define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { +entry: + store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}v3i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 +define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { +entry: + store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 +define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { +entry: + store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v4i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { +entry: + store <4 x i8> %in, <4 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i16_arg: +; EG: VTX_READ_16 
+; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { +entry: + store <4 x i16> %in, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 +define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v4f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 +define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { +entry: + store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v8i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { +entry: + store <8 x i8> %in, <8 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v8i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { +entry: + store <8 x i16> %in, <8 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v8i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { +entry: + store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v8f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { +entry: + store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 + ret void +} 
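A side note on the check lines in this file: the SI and VI patterns differ only in the s_load immediate offset, because SMRD offsets are encoded in dwords on Southern Islands while Volcanic Islands encodes them in bytes. A minimal Python sketch (illustrative only, not part of the original patch) of that conversion for the offset pairs used in this test:

# SI s_load_dword* immediate offsets are in dwords; VI encodes the same
# kernel-argument offsets in bytes, i.e. the SI value multiplied by 4.
def vi_byte_offset(si_dword_offset):
    return si_dword_offset * 4

# Offset pairs that appear in the SI/VI check lines of this test.
for si, vi in [(0x9, 0x24), (0xb, 0x2c), (0xd, 0x34), (0x11, 0x44), (0x19, 0x64)]:
    assert vi_byte_offset(si) == vi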
+ +; FUNC-LABEL: {{^}}v16i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { +entry: + store <16 x i8> %in, <16 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v16i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { +entry: + store <16 x i16> %in, <16 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v16i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { +entry: + store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v16f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, 
s[0:1], 0x64 +define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { +entry: + store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}kernel_arg_i64: +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: buffer_store_dwordx2 +define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { + store i64 %a, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}f64_kernel_arg: +; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 +; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb +; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 +; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c +; GCN: buffer_store_dwordx2 +define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { +entry: + store double %in, double addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}kernel_arg_v1i64: +; XGCN: s_load_dwordx2 +; XGCN: s_load_dwordx2 +; XGCN: buffer_store_dwordx2 +; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 +; ret void +; } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll new file mode 100644 index 00000000000..671833d1a33 --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca.ll @@ -0,0 +1,15 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s + +define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %0 = load i32, i32* %gep1 + store i32 %0, i32 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/large-constant-initializer.ll b/test/CodeGen/AMDGPU/large-constant-initializer.ll new file mode 100644 index 00000000000..9975b1b7f5c --- /dev/null +++ b/test/CodeGen/AMDGPU/large-constant-initializer.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s +; CHECK: s_endpgm + +@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 + +define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 + %mul12 = mul nsw i32 %val, 7 + br i1 undef, label %exit, label %bb + +bb: + %cmp = icmp slt i32 %x, 0 + br label %exit + +exit: + ret void +} + diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll new file mode 100644 index 00000000000..bf8df63be9f --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_init_lds_global + +@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] + +define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(3)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/test/CodeGen/AMDGPU/lds-oqap-crash.ll new file mode 100644 index 
00000000000..6ff6fc3d7af --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-oqap-crash.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s + +; The test is for a bug in R600EmitClauseMarkers.cpp where this pass +; was searching for a use of the OQAP register in order to determine +; if an LDS instruction could fit in the current clause, but never finding +; one. This created an infinite loop and hung the compiler. +; +; The LDS instruction should not have been defining OQAP in the first place, +; because the LDS instructions are pseudo instructions and the OQAP +; reads and writes are bundled together in the same instruction. + +; CHECK: {{^}}lds_crash: +define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { +entry: + %0 = load i32, i32 addrspace(3)* %in + ; This block needs to be > 115 ISA instructions to hit the bug, + ; so we'll use udiv instructions. + %div0 = udiv i32 %0, %b + %div1 = udiv i32 %div0, %a + %div2 = udiv i32 %div1, 11 + %div3 = udiv i32 %div2, %a + %div4 = udiv i32 %div3, %b + %div5 = udiv i32 %div4, %c + %div6 = udiv i32 %div5, %div0 + %div7 = udiv i32 %div6, %div1 + store i32 %div7, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll new file mode 100644 index 00000000000..44ffc36af14 --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-output-queue.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s +; +; This test checks that the lds input queue is empty at the end of +; the ALU clause. + +; CHECK-LABEL: {{^}}lds_input_queue: +; CHECK: LDS_READ_RET * OQAP +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP + +@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 + +define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +entry: + %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index + %1 = load i32, i32 addrspace(3)* %0 + call void @llvm.AMDGPU.barrier.local() + + ; This will start a new clause for the vertex fetch + %2 = load i32, i32 addrspace(1)* %in + %3 = add i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +; The machine scheduler does not do proper alias analysis and assumes that +; loads from global values (Note that a global value is different from a +; value in global memory. A global value is a value that is declared +; outside of a function; it can reside in any address space) alias with +; all other loads. +; +; This is a problem for scheduling the reads from the local data share (lds). +; These reads are implemented using two instructions. The first copies the +; data from lds into the lds output queue, and the second moves the data from +; this queue into main memory. These two instructions don't have to be +; scheduled one after the other, but they do need to be scheduled in the same +; clause.
The aliasing problem mentioned above causes problems when there is a +; load from global memory which immediately follows a load from a global value that +; has been declared in the local memory space: +; +; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index +; %1 = load i32, i32 addrspace(3)* %0 +; %2 = load i32, i32 addrspace(1)* %in +; +; The instruction selection phase will generate ISA that looks like this: +; %OQAP = LDS_READ_RET +; %vreg0 = MOV %OQAP +; %vreg1 = VTX_READ_32 +; %vreg2 = ADD_INT %vreg1, %vreg0 +; +; The bottom scheduler will schedule the two ALU instructions first: +; +; UNSCHEDULED: +; %OQAP = LDS_READ_RET +; %vreg1 = VTX_READ_32 +; +; SCHEDULED: +; +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg2 +; +; The lack of proper aliasing means the local memory read (LDS_READ_RET) +; and the global memory read (VTX_READ_32) are linked by a chain dependency, so +; the global memory read will always be scheduled first. This will give us a +; final program which looks like this: +; +; Alu clause: +; %OQAP = LDS_READ_RET +; VTX clause: +; %vreg1 = VTX_READ_32 +; Alu clause: +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg2 +; +; This is an illegal program because the OQAP def and use now occur in +; different ALU clauses. +; +; This test checks this scenario and makes sure it doesn't result in an +; illegal program. For now, we have fixed this issue by merging the +; LDS_READ_RET and MOV together during instruction selection and then +; expanding them after scheduling. Once the scheduler has better alias +; analysis, we should be able to keep these instructions separate before +; scheduling. +; +; CHECK-LABEL: {{^}}local_global_alias: +; CHECK: LDS_READ_RET +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP +define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 + %1 = load i32, i32 addrspace(3)* %0 + %2 = load i32, i32 addrspace(1)* %in + %3 = add i32 %2, %1 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll new file mode 100644 index 00000000000..3e8328659fd --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-size.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test makes sure we do not double count global values when they are +; used in different basic blocks.
+ +; CHECK: .long 166120 +; CHECK-NEXT: .long 1 +; CHECK-LABEL: {{^}}test: +@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 + +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %else + +if: + store i32 1, i32 addrspace(3)* @lds + br label %endif + +else: + store i32 2, i32 addrspace(3)* @lds + br label %endif + +endif: + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll new file mode 100644 index 00000000000..fb51bc0e50c --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global + +@lds = addrspace(3) global [256 x i32] zeroinitializer + +define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(3)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll new file mode 100644 index 00000000000..4244c48d240 --- /dev/null +++ b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This tests a bug where LegalizeDAG was not checking the target's +; BooleanContents value and always using one for true, when expanding +; setcc to select_cc. +; +; This bug caused the icmp IR instruction to be expanded to two machine +; instructions, when only one is needed. +; + +; CHECK: {{^}}setcc_expand: +; CHECK: SET +; CHECK-NOT: CND +define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp eq i32 %in, 5 + br i1 %0, label %IF, label %ENDIF +IF: + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %1 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lit.local.cfg b/test/CodeGen/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..2a665f06be7 --- /dev/null +++ b/test/CodeGen/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll new file mode 100644 index 00000000000..cff1c24f89d --- /dev/null +++ b/test/CodeGen/AMDGPU/literals.ll @@ -0,0 +1,64 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Test using an integer literal constant. +; Generated ASM should be: +; ADD_INT KC0[2].Z literal.x, 5 +; or +; ADD_INT literal.x KC0[2].Z, 5 + +; CHECK: {{^}}i32_literal: +; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5 +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = add i32 5, %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Test using a float literal constant. 
+; Generated ASM should be: +; ADD KC0[2].Z literal.x, 5.0 +; or +; ADD literal.x KC0[2].Z, 5.0 + +; CHECK: {{^}}float_literal: +; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.0 +define void @float_literal(float addrspace(1)* %out, float %in) { +entry: + %0 = fadd float 5.0, %in + store float %0, float addrspace(1)* %out + ret void +} + +; Make sure inline literals are folded into REG_SEQUENCE instructions. +; CHECK: {{^}}inline_literal_reg_sequence: +; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 + +define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { +entry: + store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: {{^}}inline_literal_dot4: +; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0 +; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 +; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 +; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 +define void @inline_literal_dot4(float addrspace(1)* %out) { +entry: + %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll new file mode 100644 index 00000000000..8bf094b8bc7 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone + +; Legacy name +declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_abs_i32: +; SI: s_sub_i32 +; SI: s_max_i32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { + %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_i32: +; SI: v_sub_i32_e32 +; SI: v_max_i32_e32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}abs_i32_legacy_amdil: +; SI: v_sub_i32_e32 +; SI: v_max_i32_e32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll new file mode 100644 index 00000000000..db883972d64 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG
-check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_barrier_global: +; EG: GROUP_BARRIER +; SI: buffer_store_dword +; SI: s_waitcnt +; SI: s_barrier + +define void @test_barrier_global(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.global() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 + %6 = load i32, i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.global() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll new file mode 100644 index 00000000000..48fb2e0b1a8 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_barrier_local: +; EG: GROUP_BARRIER + +; SI: buffer_store_dword +; SI: s_waitcnt +; SI: s_barrier + +define void @test_barrier_local(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.local() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 + %6 = load i32, i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll new file mode 100644 index 00000000000..1168713ca66 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -0,0 +1,437 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac +define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* 
%out, i32 %src0, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_bfe_print_arg: +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 +define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { + %load = load i32, i32 addrspace(1)* %src0, align 4 + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_6: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: s_endpgm +define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_7: +; SI-NOT: shl +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_8: +; SI: buffer_load_dword +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void 
@bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_13: +; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_14: +; SI-NOT: lshr +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}bfe_i32_constant_fold_test_4: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_12(i32 
addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], +; SI-NOT: v_lshl +; SI-NOT: v_ashr +; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 +; SI: buffer_store_dword [[BFE]], +define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) + %shl = shl i32 %bfe, 8 + %ashr = ashr i32 %shl, 8 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @simplify_demanded_bfe_sdiv +; SI: buffer_load_dword [[LOAD:v[0-9]+]] +; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 +; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] +; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] +; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] +; SI: buffer_store_dword 
[[TMP2]] +define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %src = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone + %div = sdiv i32 %bfe, 2 + store i32 %div, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll new file mode 100644 index 00000000000..541119242a9 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -0,0 +1,627 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: +; SI: buffer_load_ubyte +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8, i8 addrspace(1)* %in + %ext = zext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 +; SI-NOT: 
{{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_1: +; SI: buffer_load_dword +; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; SI: s_endpgm +; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, +define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_3(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_4: +; SI-NOT: lshl +; SI-NOT: shr +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = lshr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_5: +; SI: buffer_load_dword +; SI-NOT: lshl +; SI-NOT: shr +; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_6: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: s_endpgm +define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_7: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_11(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_13: +; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_14: +; SI-NOT: lshr +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}bfe_u32_constant_fold_test_5: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* 
%out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure that SimplifyDemandedBits doesn't cause the and to be +; reduced to the bits demanded by the bfe. + +; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
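+; For the constant-fold cases exercised above, the checks are consistent with the
+; usual unsigned bitfield-extract arithmetic, i.e. bfe.u32(src, offset, width)
+; folding to (src >> offset) & ((1 << width) - 1). A few worked examples, assuming
+; that reading of the intrinsic:
+;   bfe.u32(160, 4, 4)        -> (0xA0 >> 4) & 0xF         = 10   (test_11)
+;   bfe.u32(131070, 16, 16)   -> (0x1FFFE >> 16) & 0xFFFF  = 1    (test_13)
+;   bfe.u32(4294967295, 1, 7) -> (0xFFFFFFFF >> 1) & 0x7F  = 0x7f (test_16)
+; The lshr_and/and_lshr/shl_lshr tests further down exercise the reverse direction:
+; shift-and-mask patterns being matched back into s_bfe_u32.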
+; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: +; SI: buffer_load_dword [[ARG:v[0-9]+]] +; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] +; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 +; SI-DAG: buffer_store_dword [[AND]] +; SI-DAG: buffer_store_dword [[BFE]] +; SI: s_endpgm +define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, + i32 addrspace(1)* %out1, + i32 addrspace(1)* %in) nounwind { + %src = load i32, i32 addrspace(1)* %in, align 4 + %and = and i32 %src, 63 + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 + store i32 %and, i32 addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lshr_and: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = lshr i32 %a, 6 + %c = and i32 %b, 7 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_lshr_and: +; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 +; SI: buffer_store_dword +define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = lshr i32 %a, %b + %d = and i32 %c, 7 + store i32 %d, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}and_lshr: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = and i32 %a, 448 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}and_lshr2: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = and i32 %a, 511 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}shl_lshr: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 +; SI: buffer_store_dword +define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = shl i32 %a, 9 + %c = lshr i32 %b, 11 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll new file mode 100644 index 00000000000..517a55abc09 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfi_arg_arg_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_arg_arg_imm: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_arg_imm_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { + %bfi = call 
i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_imm_arg_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll new file mode 100644 index 00000000000..50492289d74 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfm_arg_arg: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; EG: BFM_INT +define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_arg_imm: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b +; EG: BFM_INT +define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_imm_arg: +; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}} +; EG: BFM_INT +define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_imm_imm: +; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8 +; EG: BFM_INT +define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_pattern: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %a = shl i32 1, %x + %b = sub i32 %a, 1 + %c = shl i32 %b, %y + store i32 %c, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}bfm_pattern_simple: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 +define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) { + %a = shl i32 1, %x + %b = sub i32 %a, 1 + store i32 %b, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll new file mode 100644 index 00000000000..301de4b1c82 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_brev_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: 
s_endpgm +define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll new file mode 100644 index 00000000000..11ec963ab31 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) nounwind readnone +declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone +declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone + +; FUNC-LABEL: {{^}}clamp_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm + +; EG: MOV_SAT +define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %src.fneg = fsub float -0.0, %src + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone + %src.fneg.fabs = fsub float -0.0, %src.fabs + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; 
FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll new file mode 100644 index 00000000000..805a88b59c7 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -0,0 +1,497 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 +declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_class_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float @llvm.fabs.f32(float %a) #1 + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fneg = fsub float -0.0, %a + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float 
@llvm.fabs.f32(float %a) #1 + %a.fneg.fabs = fsub float -0.0, %a.fabs + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 10 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_9bit_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}v_test_class_full_mask_f32: +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void 
@test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; FIXME: Why isn't this using a literal constant operand? +; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fneg = fsub double -0.0, %a + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], 
s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %a.fneg.fabs = fsub double -0.0, %a.fabs + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f64: +; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} +; SI: s_endpgm +define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f64: +; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} +; SI: s_endpgm +define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 9 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f64: +; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}v_test_class_full_mask_f64: +; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %in + + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: +; XSI: v_cmp_class_f64_e32 vcc, 1.0, +; SI: v_cmp_class_f64_e32 vcc, +; SI: s_endpgm +define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: 
{{^}}test_class_lit_constant_dynamic_mask_f64: +; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or3_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + + %sext = sext i1 %or.1 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 + %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1 + %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 + %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 + %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 + %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 + %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + %or.2 = or i1 %or.1, %class3 + %or.3 = or i1 %or.2, 
%class4 + %or.4 = or i1 %or.3, %class5 + %or.5 = or i1 %or.4, %class6 + %or.6 = or i1 %or.5, %class7 + %or.7 = or i1 %or.6, %class8 + %or.8 = or i1 %or.7, %class9 + %sext = sext i1 %or.8 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_1: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_2: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} +; SI: s_or_b64 +; SI: s_endpgm +define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f32: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f64: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll new file mode 
100644 index 00000000000..e95a51093cb --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll @@ -0,0 +1,59 @@ + +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}cube: +; CHECK: CUBE T{{[0-9]}}.X +; CHECK: CUBE T{{[0-9]}}.Y +; CHECK: CUBE T{{[0-9]}}.Z +; CHECK: CUBE * T{{[0-9]}}.W +define void @cube() #0 { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %1 = extractelement <4 x float> %0, i32 3 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %3 = extractelement <4 x float> %2, i32 0 + %4 = fdiv float %3, %1 + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %6 = extractelement <4 x float> %5, i32 1 + %7 = fdiv float %6, %1 + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %9 = extractelement <4 x float> %8, i32 2 + %10 = fdiv float %9, %1 + %11 = insertelement <4 x float> undef, float %4, i32 0 + %12 = insertelement <4 x float> %11, float %7, i32 1 + %13 = insertelement <4 x float> %12, float %10, i32 2 + %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3 + %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14) + %16 = extractelement <4 x float> %15, i32 0 + %17 = extractelement <4 x float> %15, i32 1 + %18 = extractelement <4 x float> %15, i32 2 + %19 = extractelement <4 x float> %15, i32 3 + %20 = call float @fabs(float %18) + %21 = fdiv float 1.000000e+00, %20 + %22 = fmul float %16, %21 + %23 = fadd float %22, 1.500000e+00 + %24 = fmul float %17, %21 + %25 = fadd float %24, 1.500000e+00 + %26 = insertelement <4 x float> undef, float %25, i32 0 + %27 = insertelement <4 x float> %26, float %23, i32 1 + %28 = insertelement <4 x float> %27, float %19, i32 2 + %29 = insertelement <4 x float> %28, float %25, i32 3 + %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4) + call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 + +; Function Attrs: readnone +declare float @fabs(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll new file mode 100644 index 00000000000..8b32f696449 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone + +; SI-LABEL: {{^}}test_unpack_byte0_to_float: +; SI: v_cvt_f32_ubyte0 +define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float 
@llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte1_to_float: +; SI: v_cvt_f32_ubyte1 +define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte2_to_float: +; SI: v_cvt_f32_ubyte2 +define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte3_to_float: +; SI: v_cvt_f32_ubyte3 +define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll new file mode 100644 index 00000000000..55ca9c7536e --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone +declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fixup_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fixup_f64: +; GCN: v_div_fixup_f64 +define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { + %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll new file mode 100644 index 00000000000..bcb7f870f1f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll @@ -0,0 +1,179 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs 
< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; FIXME: Enable for VI. + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate +declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone +declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fmas_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f64: +; GCN: v_div_fmas_f64 +define void 
@test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { + %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: +; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { + %cmp = icmp eq i32 %i, 0 + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: +; SI: s_mov_b64 vcc, 0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: +; SI: s_mov_b64 vcc, -1 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} +; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] +; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] +; SI: s_endpgm +define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + %cmp1 = icmp ne i32 %d, 0 + %and = and i1 %cmp0, %cmp1 + + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: +; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] + +; SI: buffer_load_dword [[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc + + +; SI: BB9_2: +; SI: s_or_b64 exec, 
exec, [[SAVE]] +; SI: v_cmp_ne_i32_e32 vcc, 0, v0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: buffer_store_dword +; SI: s_endpgm +define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + br i1 %cmp0, label %bb, label %exit + +bb: + %val = load i32, i32 addrspace(1)* %dummy + %cmp1 = icmp ne i32 %val, 0 + br label %exit + +exit: + %cond = phi i1 [false, %entry], [%cmp1, %bb] + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll new file mode 100644 index 00000000000..de830de039c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll @@ -0,0 +1,364 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone +declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; SI-LABEL @test_div_scale_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float 
addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_num_1: +; SI-DAG: buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_num_2: +; SI-DAG: buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_1: +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_2(double 
addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_2: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_1: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_2: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind 
readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_1: +; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] +; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] +; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL 
@test_div_scale_f32_fabs_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_fabs_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll new file mode 100644 index 00000000000..20c7af8ade5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_flbit: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_flbit: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %r = call i32 
@llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll new file mode 100644 index 00000000000..e098dd35d6d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.fabs.f64(double %Val) +declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}fract_f64: +; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f64_neg: +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %neg = fsub double 0.0, %val + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f64_neg_abs: +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) 
nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %abs = call double @llvm.fabs.f64(double %val) + %neg = fsub double 0.0, %abs + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll new file mode 100644 index 00000000000..7501b4b7546 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float %Val) +declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone + +; Legacy name +declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}fract_f32: +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_legacy_amdil: +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_neg: +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]] +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %neg = fsub float 0.0, %val + %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_neg_abs: +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]| +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %abs = call float @llvm.fabs.f32(float %val) + %neg = fsub float 0.0, %abs 
+ %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll new file mode 100644 index 00000000000..42102e30f07 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FIXME: Store of i32 seems to be broken pre-EG somehow? + +declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_imad24: +; SI: v_mad_i32_i24 +; CM: MULADD_INT24 +; R600: MULLO_INT +; R600: ADD_INT +define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll new file mode 100644 index 00000000000..46662f96c29 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_imax: +; SI: v_max_i32_e32 +define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_imax: +; SI: s_max_i32 +define void @scalar_imax(i32 %p0, i32 %p1) #0 { +entry: + %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.imax(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll new file mode 100644 index 00000000000..34b454e2375 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_imin: +; SI: v_min_i32_e32 +define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, 
align 4 + %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_imin: +; SI: s_min_i32 +define void @scalar_imin(i32 %p0, i32 %p1) #0 { +entry: + %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.imin(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll new file mode 100644 index 00000000000..fdc1172260b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_imul24: +; SI: v_mul_i32_i24 +; CM: MUL_INT24 +; R600: MULLO_INT +define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll new file mode 100644 index 00000000000..057708e7b5c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}kill_gs_const: +; SI-NOT: v_cmpx_le_f32 +; SI: s_mov_b64 exec, 0 + +define void @kill_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %3) + ret void +} + +; SI-LABEL: {{^}}kill_vcc_implicit_def: +; SI-NOT: v_cmp_gt_f32_e32 vcc, +; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} +; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { +entry: + %tmp0 = fcmp olt float %13, 0.0 + call void @llvm.AMDGPU.kill(float %14) + %tmp1 = select i1 %tmp0, float 1.0, float 0.0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + ret void +} + 
+declare void @llvm.AMDGPU.kill(float) +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="2" } +attributes #1 = { "ShaderType"="0" } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll new file mode 100644 index 00000000000..a59c0ce6d67 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone +declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_ldexp_f32: +; SI: v_ldexp_f32 +; SI: s_endpgm +define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { + %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_ldexp_f64: +; SI: v_ldexp_f64 +; SI: s_endpgm +define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { + %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll new file mode 100644 index 00000000000..4cafd563685 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_legacy_f32: +; SI: v_rsq_legacy_f32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll new file mode 100644 index 00000000000..83b56a5029d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.mul(float ,float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll new file mode 100644 index 00000000000..d2a655bf909 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rcp_f64: +; SI: v_rcp_f64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rcp_pat_f64: +; SI: v_rcp_f64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE-NOT: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64 +; SI-SAFE: v_rcp_f64 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll new file mode 100644 index 00000000000..edd6e9a72f1 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s + +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + +declare float @llvm.sqrt.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rcp_f32: +; SI: v_rcp_f32_e32 +; EG: RECIP_IEEE +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Evergreen only ever does unsafe fp math. 
+; FUNC-LABEL: {{^}}rcp_pat_f32: + +; SI-SAFE: v_rcp_f32_e32 +; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 + +; EG: RECIP_IEEE + +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: +; SI-UNSAFE: v_rsq_f32_e32 +; SI-SAFE: v_sqrt_f32_e32 +; SI-SAFE: v_rcp_f32_e32 + +; EG: RECIPSQRT_IEEE +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone + %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll new file mode 100644 index 00000000000..67f1d22c717 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f64: +; SI: v_rsq_clamp_f64_e32 + +; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] +; TODO: this constant should be folded: +; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 +; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff +; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] +; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] +; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff +; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] +; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] + +define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { + %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone + store double %rsq_clamped, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll new file mode 100644 index 00000000000..eeff2536b23 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f32: +; SI: v_rsq_clamp_f32_e32 + +; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} +; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] +; TODO: this constant should be folded: +; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff +; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] + +; EG: RECIPSQRT_CLAMPED + +define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone + store float %rsq_clamped, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll new file mode 100644 index 00000000000..36b72f14db1 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn 
-mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; EG: RECIPSQRT_IEEE +define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 +; EG: RECIPSQRT_IEEE +define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 +; EG: RECIPSQRT_IEEE +define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll new file mode 100644 index 00000000000..10206609bb5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll @@ -0,0 +1,42 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %addr = load <4 x float>, <4 x float> addrspace(1)* %in + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 
0, i32 0, i32 7) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) + store <4 x float> %res16, <4 x float> addrspace(1)* %out + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll new file mode 100644 index 00000000000..6b546a7e17c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_trig_preop_f64: +; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] +; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: +; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll new file mode 100644 index 00000000000..74792e50017 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; R600: {{^}}amdgpu_trunc: +; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: {{^}}amdgpu_trunc: +; SI: v_trunc_f32 + +define void @amdgpu_trunc(float addrspace(1)* %out, float %x) { +entry: + %0 = call float @llvm.AMDGPU.trunc(float %x) + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDGPU.trunc(float ) readnone diff --git 
a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll new file mode 100644 index 00000000000..77a073b0cb0 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}test_umad24: +; SI: v_mad_u32_u24 +; EG: MULADD_UINT24 +; R600: MULLO_UINT +; R600: ADD_INT +define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}commute_umad24: +; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]] +; SI: buffer_store_dword [[RESULT]] +define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1 + + %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4 + %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4 + %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out.gep, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll new file mode 100644 index 00000000000..a97d103016d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_umax: +; SI: v_max_u32_e32 +define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_umax: +; SI: s_max_u32 +define void @scalar_umax(i32 %p0, i32 %p1) #0 { +entry: + %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}trunc_zext_umax: +; SI: buffer_load_ubyte [[VREG:v[0-9]+]], +; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] +; SI-NOT: and +; SI: 
buffer_store_short [[RESULT]], +define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = zext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = zext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.umax(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll new file mode 100644 index 00000000000..2acd10e0c63 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_umin: +; SI: v_min_u32_e32 +define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_umin: +; SI: s_min_u32 +define void @scalar_umin(i32 %p0, i32 %p1) #0 { +entry: + %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}trunc_zext_umin: +; SI: buffer_load_ubyte [[VREG:v[0-9]+]], +; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] +; SI-NOT: and +; SI: buffer_store_short [[RESULT]], +define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = zext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = zext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.umin(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll new file mode 100644 index 00000000000..76624a078b3 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + 
+declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_umul24: +; SI: v_mul_u32_u24 +; R600: MUL_UINT24 +; R600: MULLO_UINT +define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll new file mode 100644 index 00000000000..3d05da616e4 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -0,0 +1,59 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +;GCN-LABEL: {{^}}main: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 +;GCN-NEXT: v_interp_mov_f32 +;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +main_body: + %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) + %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7) + ret void +} + +; Test that v_interp_p1 uses different source and destination registers +; on 16 bank LDS chips. + +; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: +; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] + +define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +main_body: + %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) + %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) + %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) + %25 = call float @fabs(float %22) + %26 = call float @fabs(float %23) + %27 = call float @fabs(float %24) + %28 = call i32 @llvm.SI.packf16(float %25, float %26) + %29 = bitcast i32 %28 to float + %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) + %31 = bitcast i32 %30 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) + ret void +} + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.constant(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll new file mode 100644 index 00000000000..275cb580bc9 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll @@ -0,0 +1,509 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde 
-verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}gather4_v2: +;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4: +;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl: +;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l: +;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b: +;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl: +;CHECK: 
image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl_v8: +;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz_v2: +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz: +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_o: +;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl_o: +;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl_o_v8: +;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l_o: +;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l_o_v8: +;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_o: +;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_o_v8: +;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o_v8() #0 { +main_body: + 
%r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl_o: +;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz_o: +;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_c: +;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl: +;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl_v8: +;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> 
undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l: +;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l_v8: +;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b: +;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_v8: +;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_cl: +;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x 
float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz: +;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_c_o: +;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_o_v8: +;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl_o: +;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l_o: +;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement 
<4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_o: +;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_cl_o: +;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz_o: +;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: +;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, 
<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll new file mode 100644 index 00000000000..06ee98e91b3 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll @@ -0,0 +1,45 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}getlod: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: {{^}}getlod_v2: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: {{^}}getlod_v4: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + + +declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll new file mode 100644 index 00000000000..0fac8d79956 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.image.ll @@ -0,0 +1,50 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}image_load: +;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} +define void @image_load() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}image_load_mip: +;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @image_load_mip() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}getresinfo: +;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getresinfo() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll new file mode 100644 index 00000000000..4bc638a2806 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll @@ -0,0 +1,310 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}sample: +;CHECK: s_wqm +;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cl: +;CHECK: s_wqm +;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b: +;CHECK: s_wqm +;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void 
@sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c: +;CHECK: s_wqm +;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { 
+main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b: +;CHECK: s_wqm +;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() 
#0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x 
float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll new file mode 100644 index 00000000000..9d8935414ed --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll @@ -0,0 +1,310 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}sample: +;CHECK: s_wqm +;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cl: +;CHECK: s_wqm +;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, 
i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b: +;CHECK: s_wqm +;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, 
i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c: +;CHECK: s_wqm +;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, 
i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b: +;CHECK: s_wqm +;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll new file mode 100644 index 00000000000..b67716c3b66 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll @@ -0,0 +1,132 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, + <32 x i8> undef, i32 1) + %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, + <32 x i8> undef, i32 2) + %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, + <32 x i8> undef, i32 3) + %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, + <32 x i8> undef, i32 4) + %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, + <32 x i8> undef, i32 5) + %res6 = call <4 x i32> @llvm.SI.imageload.(<4 
x i32> %v6, + <32 x i8> undef, i32 6) + %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, + <32 x i8> undef, i32 10) + %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, + <32 x i8> undef, i32 11) + %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, + <32 x i8> undef, i32 15) + %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, + <32 x i8> undef, i32 16) + %e1 = extractelement <4 x i32> %res1, i32 0 + %e2 = extractelement <4 x i32> %res2, i32 1 + %e3 = extractelement <4 x i32> %res3, i32 2 + %e4 = extractelement <4 x i32> %res4, i32 3 + %t0 = extractelement <4 x i32> %res5, i32 0 + %t1 = extractelement <4 x i32> %res5, i32 1 + %e5 = add i32 %t0, %t1 + %t2 = extractelement <4 x i32> %res6, i32 0 + %t3 = extractelement <4 x i32> %res6, i32 2 + %e6 = add i32 %t2, %t3 + %t10 = extractelement <4 x i32> %res10, i32 2 + %t11 = extractelement <4 x i32> %res10, i32 3 + %e10 = add i32 %t10, %t11 + %t12 = extractelement <4 x i32> %res11, i32 0 + %t13 = extractelement <4 x i32> %res11, i32 1 + %t14 = extractelement <4 x i32> %res11, i32 2 + %t15 = add i32 %t12, %t13 + %e11 = add i32 %t14, %t15 + %t28 = extractelement <4 x i32> %res15, i32 0 + %t29 = extractelement <4 x i32> %res15, i32 1 + %t30 = extractelement <4 x i32> %res15, i32 2 + %t31 = extractelement <4 x i32> %res15, i32 3 + %t32 = add i32 %t28, %t29 + %t33 = add i32 %t30, %t31 + %e15 = add i32 %t32, %t33 + %e16 = extractelement <4 x i32> %res16, i32 3 + %s1 = add i32 %e1, %e2 + %s2 = add i32 %s1, %e3 + %s3 = add i32 %s2, %e4 + %s4 = add i32 %s3, %e5 + %s5 = add i32 %s4, %e6 + %s9 = add i32 %s5, %e10 + %s10 = add i32 %s9, %e11 + %s14 = add i32 %s10, %e15 + %s15 = add i32 %s14, %e16 + %s16 = bitcast i32 %s15 to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) + ret void +} + +; Test that coordinates are stored in vgprs and not sgprs +; CHECK: vgpr_coords +; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} +define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 + %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 + %22 = getelementptr float, float addrspace(2)* %21, i32 0 + %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 + %24 = getelementptr float, float addrspace(2)* %21, i32 1 + %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 + %26 = getelementptr float, float addrspace(2)* %21, i32 4 + %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 + %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 + %30 = bitcast float %27 to i32 + %31 = bitcast float %23 to i32 + %32 = bitcast float %25 to i32 + %33 = insertelement <4 x i32> undef, i32 %31, i32 0 + %34 = insertelement <4 x i32> %33, i32 %32, i32 1 + %35 = insertelement <4 x i32> %34, i32 %30, i32 2 + %36 = insertelement <4 x i32> %35, i32 undef, i32 3 + %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) + %38 = extractelement <4 x i32> %37, i32 0 + %39 = extractelement <4 x i32> %37, i32 1 + %40 = extractelement <4 x i32> %37,
i32 2 + %41 = extractelement <4 x i32> %37, i32 3 + %42 = bitcast i32 %38 to float + %43 = bitcast i32 %39 to float + %44 = bitcast i32 %40 to float + %45 = bitcast i32 %41 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) + ret void +} + +declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone +; Function Attrs: nounwind readnone +declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null} +!1 = !{} +!2 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll new file mode 100644 index 00000000000..f6c258539d5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s + +; Example of a simple geometry shader loading vertex attributes from the +; ESGS ring buffer + +; FIXME: Out of bounds immediate offset crashes + +; CHECK-LABEL: {{^}}main: +; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc +; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc + +define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { +main_body: + %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = shl i32 %arg6, 2 + %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) + %tmp13 = bitcast i32 %tmp12 to float + %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp15 = bitcast i32 %tmp14 to float + %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) + %tmp17 = bitcast i32 %tmp16 to float + %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp19 = bitcast i32 %tmp18 to float + + %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp21 = bitcast i32 %tmp20 to float + + %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp23 = bitcast i32 %tmp22 to float + + call void @llvm.SI.export(i32 15, i32 0, i32 
1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) + ret void +} + +; Function Attrs: nounwind readonly +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readonly +declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { nounwind readonly } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll new file mode 100644 index 00000000000..ac95fd0b83a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, + i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { + %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) + %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) + %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) + %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) + %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) + %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) + %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) + %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) + %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) + %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) + %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) + %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) + %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) + %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) + %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) + %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) + %e1 = extractelement <4 x i32> %res1, i32 0 + 
%e2 = extractelement <4 x i32> %res2, i32 1 + %e3 = extractelement <4 x i32> %res3, i32 2 + %e4 = extractelement <4 x i32> %res4, i32 3 + %t0 = extractelement <4 x i32> %res5, i32 0 + %t1 = extractelement <4 x i32> %res5, i32 1 + %e5 = add i32 %t0, %t1 + %t2 = extractelement <4 x i32> %res6, i32 0 + %t3 = extractelement <4 x i32> %res6, i32 2 + %e6 = add i32 %t2, %t3 + %t4 = extractelement <4 x i32> %res7, i32 0 + %t5 = extractelement <4 x i32> %res7, i32 3 + %e7 = add i32 %t4, %t5 + %t6 = extractelement <4 x i32> %res8, i32 1 + %t7 = extractelement <4 x i32> %res8, i32 2 + %e8 = add i32 %t6, %t7 + %t8 = extractelement <4 x i32> %res9, i32 1 + %t9 = extractelement <4 x i32> %res9, i32 3 + %e9 = add i32 %t8, %t9 + %t10 = extractelement <4 x i32> %res10, i32 2 + %t11 = extractelement <4 x i32> %res10, i32 3 + %e10 = add i32 %t10, %t11 + %t12 = extractelement <4 x i32> %res11, i32 0 + %t13 = extractelement <4 x i32> %res11, i32 1 + %t14 = extractelement <4 x i32> %res11, i32 2 + %t15 = add i32 %t12, %t13 + %e11 = add i32 %t14, %t15 + %t16 = extractelement <4 x i32> %res12, i32 0 + %t17 = extractelement <4 x i32> %res12, i32 1 + %t18 = extractelement <4 x i32> %res12, i32 3 + %t19 = add i32 %t16, %t17 + %e12 = add i32 %t18, %t19 + %t20 = extractelement <4 x i32> %res13, i32 0 + %t21 = extractelement <4 x i32> %res13, i32 2 + %t22 = extractelement <4 x i32> %res13, i32 3 + %t23 = add i32 %t20, %t21 + %e13 = add i32 %t22, %t23 + %t24 = extractelement <4 x i32> %res14, i32 1 + %t25 = extractelement <4 x i32> %res14, i32 2 + %t26 = extractelement <4 x i32> %res14, i32 3 + %t27 = add i32 %t24, %t25 + %e14 = add i32 %t26, %t27 + %t28 = extractelement <4 x i32> %res15, i32 0 + %t29 = extractelement <4 x i32> %res15, i32 1 + %t30 = extractelement <4 x i32> %res15, i32 2 + %t31 = extractelement <4 x i32> %res15, i32 3 + %t32 = add i32 %t28, %t29 + %t33 = add i32 %t30, %t31 + %e15 = add i32 %t32, %t33 + %e16 = extractelement <4 x i32> %res16, i32 3 + %s1 = add i32 %e1, %e2 + %s2 = add i32 %s1, %e3 + %s3 = add i32 %s2, %e4 + %s4 = add i32 %s3, %e5 + %s5 = add i32 %s4, %e6 + %s6 = add i32 %s5, %e7 + %s7 = add i32 %s6, %e8 + %s8 = add i32 %s7, %e9 + %s9 = add i32 %s8, %e10 + %s10 = add i32 %s9, %e11 + %s11 = add i32 %s10, %e12 + %s12 = add i32 %s11, %e13 + %s13 = add i32 %s12, %e14 + %s14 = add i32 %s13, %e15 + %s15 = add i32 %s14, %e16 + %s16 = bitcast i32 %s15 to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) + ret void +} + +declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll new file mode 100644 index 00000000000..ce9558cbf81 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll @@ -0,0 +1,96 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s + +; CHECK-LABEL: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + 
+; CHECK-LABEL: {{^}}v2: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 +define void @v2(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v3: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +define void @v3(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v4: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 +define void @v4(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v5: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +define void @v5(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v6: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 +define void @v6(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v7: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 +define void @v7(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.sample.ll new file mode 100644 index 00000000000..509c45f588b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sample.ll @@ -0,0 +1,160 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga 
-verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 +;CHECK-DAG: image_sample {{v[0-9]+}}, 2 +;CHECK-DAG: image_sample {{v[0-9]+}}, 1 +;CHECK-DAG: image_sample {{v[0-9]+}}, 4 +;CHECK-DAG: image_sample {{v[0-9]+}}, 8 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +;CHECK-DAG: image_sample {{v[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 + %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 + %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 + %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 + %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, + <32 x i8> undef, <16 x i8> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, + <32 x i8> undef, <16 x i8> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, + <32 x i8> undef, <16 x i8> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, + <32 x i8> undef, <16 x i8> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, + <32 x i8> undef, <16 x i8> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, + <32 x i8> undef, <16 x i8> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, + <32 x i8> undef, <16 x i8> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, + <32 x i8> undef, <16 x i8> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, + <32 x i8> undef, <16 x i8> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, + <32 x i8> undef, <16 x i8> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, + <32 x i8> undef, <16 x i8> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, + <32 x i8> undef, <16 x i8> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, + <32 x i8> undef, <16 x i8> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, + <32 x i8> undef, <16 x i8> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, + <32 x i8> undef, <16 x i8> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, + <32 x i8> undef, <16 x i8> undef, i32 16) + %e1 = extractelement <4 x float> %res1, i32 0 + %e2 = extractelement <4 x float> %res2, i32 1 + %e3 = extractelement <4 x float> 
%res3, i32 2 + %e4 = extractelement <4 x float> %res4, i32 3 + %t0 = extractelement <4 x float> %res5, i32 0 + %t1 = extractelement <4 x float> %res5, i32 1 + %e5 = fadd float %t0, %t1 + %t2 = extractelement <4 x float> %res6, i32 0 + %t3 = extractelement <4 x float> %res6, i32 2 + %e6 = fadd float %t2, %t3 + %t4 = extractelement <4 x float> %res7, i32 0 + %t5 = extractelement <4 x float> %res7, i32 3 + %e7 = fadd float %t4, %t5 + %t6 = extractelement <4 x float> %res8, i32 1 + %t7 = extractelement <4 x float> %res8, i32 2 + %e8 = fadd float %t6, %t7 + %t8 = extractelement <4 x float> %res9, i32 1 + %t9 = extractelement <4 x float> %res9, i32 3 + %e9 = fadd float %t8, %t9 + %t10 = extractelement <4 x float> %res10, i32 2 + %t11 = extractelement <4 x float> %res10, i32 3 + %e10 = fadd float %t10, %t11 + %t12 = extractelement <4 x float> %res11, i32 0 + %t13 = extractelement <4 x float> %res11, i32 1 + %t14 = extractelement <4 x float> %res11, i32 2 + %t15 = fadd float %t12, %t13 + %e11 = fadd float %t14, %t15 + %t16 = extractelement <4 x float> %res12, i32 0 + %t17 = extractelement <4 x float> %res12, i32 1 + %t18 = extractelement <4 x float> %res12, i32 3 + %t19 = fadd float %t16, %t17 + %e12 = fadd float %t18, %t19 + %t20 = extractelement <4 x float> %res13, i32 0 + %t21 = extractelement <4 x float> %res13, i32 2 + %t22 = extractelement <4 x float> %res13, i32 3 + %t23 = fadd float %t20, %t21 + %e13 = fadd float %t22, %t23 + %t24 = extractelement <4 x float> %res14, i32 1 + %t25 = extractelement <4 x float> %res14, i32 2 + %t26 = extractelement <4 x float> %res14, i32 3 + %t27 = fadd float %t24, %t25 + %e14 = fadd float %t26, %t27 + %t28 = extractelement <4 x float> %res15, i32 0 + %t29 = extractelement <4 x float> %res15, i32 1 + %t30 = extractelement <4 x float> %res15, i32 2 + %t31 = extractelement <4 x float> %res15, i32 3 + %t32 = fadd float %t28, %t29 + %t33 = fadd float %t30, %t31 + %e15 = fadd float %t32, %t33 + %e16 = extractelement <4 x float> %res16, i32 3 + %s1 = fadd float %e1, %e2 + %s2 = fadd float %s1, %e3 + %s3 = fadd float %s2, %e4 + %s4 = fadd float %s3, %e5 + %s5 = fadd float %s4, %e6 + %s6 = fadd float %s5, %e7 + %s7 = fadd float %s6, %e8 + %s8 = fadd float %s7, %e9 + %s9 = fadd float %s8, %e10 + %s10 = fadd float %s9, %e11 + %s11 = fadd float %s10, %e12 + %s12 = fadd float %s11, %e13 + %s13 = fadd float %s12, %e14 + %s14 = fadd float %s13, %e15 + %s15 = fadd float %s14, %e16 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) + ret void +} + +; CHECK: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + %5 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) + ret void +} + + +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll new file mode 100644 index 
00000000000..f2badff2a99 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll @@ -0,0 +1,143 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 + %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 + %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 + %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 + %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, + <32 x i8> undef, <16 x i8> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, + <32 x i8> undef, <16 x i8> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, + <32 x i8> undef, <16 x i8> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, + <32 x i8> undef, <16 x i8> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, + <32 x i8> undef, <16 x i8> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, + <32 x i8> undef, <16 x i8> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, + <32 x i8> undef, <16 x i8> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, + <32 x i8> undef, <16 x i8> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, + <32 x i8> undef, <16 x i8> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, + <32 x i8> undef, <16 x i8> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, + <32 x i8> undef, <16 x i8> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, + <32 x i8> undef, <16 x i8> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, + <32 x i8> undef, <16 x i8> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, + <32 x i8> undef, <16 x i8> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, + <32 x i8> 
undef, <16 x i8> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, + <32 x i8> undef, <16 x i8> undef, i32 16) + %e1 = extractelement <4 x float> %res1, i32 0 + %e2 = extractelement <4 x float> %res2, i32 1 + %e3 = extractelement <4 x float> %res3, i32 2 + %e4 = extractelement <4 x float> %res4, i32 3 + %t0 = extractelement <4 x float> %res5, i32 0 + %t1 = extractelement <4 x float> %res5, i32 1 + %e5 = fadd float %t0, %t1 + %t2 = extractelement <4 x float> %res6, i32 0 + %t3 = extractelement <4 x float> %res6, i32 2 + %e6 = fadd float %t2, %t3 + %t4 = extractelement <4 x float> %res7, i32 0 + %t5 = extractelement <4 x float> %res7, i32 3 + %e7 = fadd float %t4, %t5 + %t6 = extractelement <4 x float> %res8, i32 1 + %t7 = extractelement <4 x float> %res8, i32 2 + %e8 = fadd float %t6, %t7 + %t8 = extractelement <4 x float> %res9, i32 1 + %t9 = extractelement <4 x float> %res9, i32 3 + %e9 = fadd float %t8, %t9 + %t10 = extractelement <4 x float> %res10, i32 2 + %t11 = extractelement <4 x float> %res10, i32 3 + %e10 = fadd float %t10, %t11 + %t12 = extractelement <4 x float> %res11, i32 0 + %t13 = extractelement <4 x float> %res11, i32 1 + %t14 = extractelement <4 x float> %res11, i32 2 + %t15 = fadd float %t12, %t13 + %e11 = fadd float %t14, %t15 + %t16 = extractelement <4 x float> %res12, i32 0 + %t17 = extractelement <4 x float> %res12, i32 1 + %t18 = extractelement <4 x float> %res12, i32 3 + %t19 = fadd float %t16, %t17 + %e12 = fadd float %t18, %t19 + %t20 = extractelement <4 x float> %res13, i32 0 + %t21 = extractelement <4 x float> %res13, i32 2 + %t22 = extractelement <4 x float> %res13, i32 3 + %t23 = fadd float %t20, %t21 + %e13 = fadd float %t22, %t23 + %t24 = extractelement <4 x float> %res14, i32 1 + %t25 = extractelement <4 x float> %res14, i32 2 + %t26 = extractelement <4 x float> %res14, i32 3 + %t27 = fadd float %t24, %t25 + %e14 = fadd float %t26, %t27 + %t28 = extractelement <4 x float> %res15, i32 0 + %t29 = extractelement <4 x float> %res15, i32 1 + %t30 = extractelement <4 x float> %res15, i32 2 + %t31 = extractelement <4 x float> %res15, i32 3 + %t32 = fadd float %t28, %t29 + %t33 = fadd float %t30, %t31 + %e15 = fadd float %t32, %t33 + %e16 = extractelement <4 x float> %res16, i32 3 + %s1 = fadd float %e1, %e2 + %s2 = fadd float %s1, %e3 + %s3 = fadd float %s2, %e4 + %s4 = fadd float %s3, %e5 + %s5 = fadd float %s4, %e6 + %s6 = fadd float %s5, %e7 + %s7 = fadd float %s6, %e8 + %s8 = fadd float %s7, %e9 + %s9 = fadd float %s8, %e10 + %s10 = fadd float %s9, %e11 + %s11 = fadd float %s10, %e12 + %s12 = fadd float %s11, %e13 + %s13 = fadd float %s12, %e14 + %s14 = fadd float %s13, %e15 + %s15 = fadd float %s14, %e16 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) + ret void +} + +declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll new file mode 100644 index 00000000000..2198590f2df --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll @@ -0,0 +1,20 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}main: +; BOTH: 
s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; BOTH-NEXT: s_sendmsg Gs_done(nop) +; BOTH-NEXT: s_endpgm + +define void @main(i32 inreg %a) #0 { +main_body: + call void @llvm.SI.sendmsg(i32 3, i32 %a) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.SI.sendmsg(i32, i32) #1 + +attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll new file mode 100644 index 00000000000..09675d50335 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll @@ -0,0 +1,24 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg Gs(emit stream 0) +; CHECK: s_sendmsg Gs(cut stream 1) +; CHECK: s_sendmsg Gs(emit-cut stream 2) +; CHECK: s_sendmsg Gs_done(nop) + +define void @main() { +main_body: + call void @llvm.SI.sendmsg(i32 34, i32 0); + call void @llvm.SI.sendmsg(i32 274, i32 0); + call void @llvm.SI.sendmsg(i32 562, i32 0); + call void @llvm.SI.sendmsg(i32 3, i32 0); + ret void +} + +; Function Attrs: nounwind +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll new file mode 100644 index 00000000000..71f51548a5f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -0,0 +1,47 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}test1: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test1(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test2: +;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test2(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test3: +;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test3(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, + i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test4: +;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test4(i32 %vdata, i32 %vaddr) #0 { + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, + i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void 
@llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.tid.ll b/test/CodeGen/AMDGPU/llvm.SI.tid.ll new file mode 100644 index 00000000000..f6e6d7050ba --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.tid.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +;GCN: v_mbcnt_lo_u32_b32_e64 +;SI: v_mbcnt_hi_u32_b32_e32 +;VI: v_mbcnt_hi_u32_b32_e64 + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %4 = call i32 @llvm.SI.tid() + %5 = bitcast i32 %4 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) + ret void +} + +declare i32 @llvm.SI.tid() readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll new file mode 100644 index 00000000000..036cd2ca82a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone + +define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { + %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 + %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 + %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone + store float %dp4, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll new file mode 100644 index 00000000000..42df6db1ccf --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}kilp_gs_const: +; SI: s_mov_b64 exec, 0 +define void @kilp_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %3) + ret void +} + +declare void @llvm.AMDGPU.kilp(float) + +attributes #0 = { "ShaderType"="2" } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll new file mode 100644 index 00000000000..4e4c2ec7791 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone + +; FUNC-LABEL: {{^}}test_lrp: +; SI: v_sub_f32 +; SI: v_mad_f32 +define void 
@test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { + %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone + store float %mad, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll new file mode 100644 index 00000000000..c65df8b3e8d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.cos.ll @@ -0,0 +1,41 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +;FUNC-LABEL: test +;EG: MULADD_IEEE * +;EG: FRACT * +;EG: ADD * +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG-NOT: COS +;SI: v_cos_f32 +;SI-NOT: v_cos_f32 + +define void @test(float addrspace(1)* %out, float %x) #1 { + %cos = call float @llvm.cos.f32(float %x) + store float %cos, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: testv +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG-NOT: COS +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI-NOT: v_cos_f32 + +define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { + %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) + store <4 x float> %cos, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.cos.f32(float) readnone +declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.exp2.ll b/test/CodeGen/AMDGPU/llvm.exp2.ll new file mode 100644 index 00000000000..42698925aae --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +;FUNC-LABEL: {{^}}test: +;EG: EXP_IEEE +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.exp2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv2: +;EG: EXP_IEEE +;EG: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 +;SI: v_exp_f32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv4: +;EG: EXP_IEEE +;EG: EXP_IEEE +;EG: EXP_IEEE +;EG: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 +;SI: v_exp_f32 +;SI: v_exp_f32 +;SI: v_exp_f32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/llvm.log2.ll b/test/CodeGen/AMDGPU/llvm.log2.ll new file mode 100644 index 00000000000..c75e7850b35 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.log2.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +;FUNC-LABEL: {{^}}test: +;EG: LOG_IEEE +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.log2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv2: +;EG: LOG_IEEE +;EG: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 +;SI: v_log_f32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv4: +;EG: LOG_IEEE +;EG: LOG_IEEE +;EG: LOG_IEEE +;EG: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 +;SI: v_log_f32 +;SI: v_log_f32 +;SI: v_log_f32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.log2.f32(float) readnone +declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll new file mode 100644 index 00000000000..e491732cf9c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -0,0 +1,365 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind + + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: 
ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2: +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 + +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 + +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 + +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind + ret void +} + +; FIXME: Use 64-bit ops +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 
addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1: +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2: +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort + +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short + +; 
SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.pow.ll b/test/CodeGen/AMDGPU/llvm.pow.ll new file mode 100644 index 00000000000..c4ae652619c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.pow.ll @@ -0,0 +1,40 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-LABEL: test1: +;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, + +define void @test1(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = call float @llvm.pow.f32( float %r0, float %r1) + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: test2: +;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: LOG_IEEE * 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { + %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.pow.f32(float ,float ) readonly +declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/test/CodeGen/AMDGPU/llvm.rint.f64.ll new file mode 100644 index 00000000000..c63fb172794 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rint_f64: +; CI: v_rndne_f64_e32 + +; SI-DAG: v_add_f64 +; SI-DAG: v_add_f64 +; SI-DAG v_cmp_gt_f64_e64 +; SI: v_cndmask_b32 +; SI: v_cndmask_b32 +; SI: s_endpgm +define void @rint_f64(double addrspace(1)* %out, double %in) { +entry: + %0 = call double @llvm.rint.f64(double %in) + store double %0, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v2f64: +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +entry: + %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) + store <2 x double> %0, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v4f64: +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +entry: + %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) + store <4 x double> %0, <4 x double> addrspace(1)* %out + ret void +} + + +declare double @llvm.rint.f64(double) #0 +declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 +declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0 diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll new file mode 100644 index 00000000000..661db51ad03 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.rint.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rint_f32: +; R600: RNDNE + +; SI: v_rndne_f32_e32 +define void @rint_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.rint.f32(float %in) #0 + store float %0, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v2f32: +; R600: RNDNE +; R600: RNDNE + +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v4f32: +; 
R600: RNDNE +; R600: RNDNE +; R600: RNDNE +; R600: RNDNE + +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32: +; R600: RNDNE + +; SI: v_rndne_f32_e32 +define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0 + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDIL.round.nearest.f32(float) #0 +declare float @llvm.rint.f32(float) #0 +declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0 +declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll new file mode 100644 index 00000000000..3d0f57e3328 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f64: +; SI: s_endpgm +define void @round_f64(double addrspace(1)* %out, double %x) #0 { + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out + ret void +} + +; This is a pretty large function, so just test a few of the +; instructions that are necessary. + +; FUNC-LABEL: {{^}}v_round_f64: +; SI: buffer_load_dwordx2 +; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 + +; SI-DAG: v_not_b32_e32 +; SI-DAG: v_not_b32_e32 + +; SI-DAG: v_cmp_eq_i32 + +; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff +; SI-DAG: v_cmp_gt_i32_e64 +; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] + +; SI-DAG: v_cmp_gt_i32_e64 + + +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %x = load double, double addrspace(1)* %gep + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}round_v2f64: +; SI: s_endpgm +define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { + %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f64: +; SI: s_endpgm +define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { + %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f64: +; SI: s_endpgm +define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { + %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +declare double @llvm.round.f64(double) #1 +declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 +declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 +declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { 
nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll new file mode 100644 index 00000000000..f5f124d915a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.round.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f32: +; SI-DAG: s_load_dword [[SX:s[0-9]+]] +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff +; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] +; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| +; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; SI: buffer_store_dword [[RESULT]] + +; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] +; R600-DAG: ADD {{.*}}, +; R600-DAG: BFI_INT +; R600-DAG: SETGE +; R600-DAG: CNDE +; R600-DAG: ADD +define void @round_f32(float addrspace(1)* %out, float %x) #0 { + %result = call float @llvm.round.f32(float %x) #1 + store float %result, float addrspace(1)* %out + ret void +} + +; The vector tests are really difficult to verify, since it can be hard to +; predict how the scheduler will order the instructions. We already have +; a test for the scalar case, so the vector tests just check that the +; compiler doesn't crash. + +; FUNC-LABEL: {{^}}round_v2f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { + %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { + %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { + %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.round.f32(float) #1 +declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 +declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll new file mode 100644 index 00000000000..3bb245c2e24 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.sin.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI 
-check-prefix=SI-UNSAFE -check-prefix=FUNC %s + +; FUNC-LABEL: sin_f32 +; EG: MULADD_IEEE * +; EG: FRACT * +; EG: ADD * +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG-NOT: SIN +; SI: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 + +define void @sin_f32(float addrspace(1)* %out, float %x) #1 { + %sin = call float @llvm.sin.f32(float %x) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_3x_f32: +; SI-UNSAFE-NOT: v_add_f32 +; SI-UNSAFE: 0x3ef47644 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_mul_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 3.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_2x_f32: +; SI-UNSAFE-NOT: v_add_f32 +; SI-UNSAFE: 0x3ea2f983 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_add_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 2.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2sin_f32: +; SI-UNSAFE: 0x3ea2f983 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_add_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 2.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_v4f32: +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG-NOT: SIN +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 + +define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { + %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) + store <4 x float> %sin, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.sin.f32(float) readnone +declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.ll b/test/CodeGen/AMDGPU/llvm.sqrt.ll new file mode 100644 index 00000000000..c6da047f539 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.sqrt.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI + +; R600-LABEL: {{^}}sqrt_f32: +; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z +; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS +; SI-LABEL: {{^}}sqrt_f32: +; SI: v_sqrt_f32_e32 +define void @sqrt_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.sqrt.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +; R600-LABEL: {{^}}sqrt_v2f32: +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS +; SI-LABEL: {{^}}sqrt_v2f32: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x 
float> %in) { +entry: + %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; R600-LABEL: {{^}}sqrt_v4f32: +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS +; SI-LABEL: {{^}}sqrt_v4f32: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check: +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check(float addrspace(1)* %out, float %in) { +entry: + %sqrt = call float @llvm.sqrt.f32(float %in) + %cmp = fcmp olt float %in, -0.000000e+00 + %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt + store float %res, float addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_ult: +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) { +entry: + %sqrt = call float @llvm.sqrt.f32(float %in) + %cmp = fcmp ult float %in, -0.000000e+00 + %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt + store float %res, float addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_v2: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + %cmp = fcmp olt <2 x float> %in, + %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_v2_ult +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + %cmp = fcmp ult <2 x float> %in, + %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.sqrt.f32(float %in) +declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) diff --git a/test/CodeGen/AMDGPU/load-i1.ll b/test/CodeGen/AMDGPU/load-i1.ll new file mode 100644 index 00000000000..0ca49fde3e7 --- /dev/null +++ b/test/CodeGen/AMDGPU/load-i1.ll @@ -0,0 +1,149 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}global_copy_i1_to_i1: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: buffer_store_byte +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: AND_INT +define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 
addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + store i1 %load, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}local_copy_i1_to_i1: +; SI: ds_read_u8 +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: ds_write_b8 +; SI: s_endpgm + +; EG: LDS_UBYTE_READ_RET +; EG: AND_INT +; EG: LDS_BYTE_WRITE +define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind { + %load = load i1, i1 addrspace(3)* %in + store i1 %load, i1 addrspace(3)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}constant_copy_i1_to_i1: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: buffer_store_byte +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: AND_INT +define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind { + %load = load i1, i1 addrspace(2)* %in + store i1 %load, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}global_sextload_i1_to_i32: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: buffer_store_dword +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: BFE_INT +define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm + +define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_sextload_i1_to_i64: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i64 + store i64 %ext, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: +; SI: buffer_load_ubyte +; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i64 + store i64 %ext, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 +; SI: buffer_store_byte +; SI: s_endpgm +define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { + store i1 %x, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_zext_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { + %ext = zext i1 %x to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_zext_i64: +; SI: buffer_load_ubyte +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { + %ext = zext i1 %x to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_sext_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { + %ext = sext i1 %x to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_sext_i64: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: v_ashrrev_i32 +; 
SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { + %ext = sext i1 %x to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll new file mode 100644 index 00000000000..1daf0e6527b --- /dev/null +++ b/test/CodeGen/AMDGPU/load-input-fold.ll @@ -0,0 +1,117 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = extractelement <4 x float> %reg2, i32 1 + %6 = extractelement <4 x float> %reg2, i32 2 + %7 = extractelement <4 x float> %reg2, i32 3 + %8 = extractelement <4 x float> %reg3, i32 0 + %9 = extractelement <4 x float> %reg3, i32 1 + %10 = extractelement <4 x float> %reg3, i32 2 + %11 = extractelement <4 x float> %reg3, i32 3 + %12 = load <4 x float>, <4 x float> addrspace(8)* null + %13 = extractelement <4 x float> %12, i32 0 + %14 = fmul float %0, %13 + %15 = load <4 x float>, <4 x float> addrspace(8)* null + %16 = extractelement <4 x float> %15, i32 1 + %17 = fmul float %0, %16 + %18 = load <4 x float>, <4 x float> addrspace(8)* null + %19 = extractelement <4 x float> %18, i32 2 + %20 = fmul float %0, %19 + %21 = load <4 x float>, <4 x float> addrspace(8)* null + %22 = extractelement <4 x float> %21, i32 3 + %23 = fmul float %0, %22 + %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %25 = extractelement <4 x float> %24, i32 0 + %26 = fmul float %1, %25 + %27 = fadd float %26, %14 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %29 = extractelement <4 x float> %28, i32 1 + %30 = fmul float %1, %29 + %31 = fadd float %30, %17 + %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %33 = extractelement <4 x float> %32, i32 2 + %34 = fmul float %1, %33 + %35 = fadd float %34, %20 + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %37 = extractelement <4 x float> %36, i32 3 + %38 = fmul float %1, %37 + %39 = fadd float %38, %23 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %41 = extractelement <4 x float> %40, i32 0 + %42 = fmul float %2, %41 + %43 = fadd float %42, %27 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %45 = extractelement <4 x float> %44, i32 1 + %46 = fmul float %2, %45 + %47 = fadd float %46, %31 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %49 = extractelement <4 x float> %48, i32 2 + %50 = fmul float %2, %49 + %51 = fadd float %50, %35 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + 
%53 = extractelement <4 x float> %52, i32 3 + %54 = fmul float %2, %53 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %57 = extractelement <4 x float> %56, i32 0 + %58 = fmul float %3, %57 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %61 = extractelement <4 x float> %60, i32 1 + %62 = fmul float %3, %61 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %65 = extractelement <4 x float> %64, i32 2 + %66 = fmul float %3, %65 + %67 = fadd float %66, %51 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %69 = extractelement <4 x float> %68, i32 3 + %70 = fmul float %3, %69 + %71 = fadd float %70, %55 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %73 = extractelement <4 x float> %72, i32 0 + %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %75 = extractelement <4 x float> %74, i32 1 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %77 = extractelement <4 x float> %76, i32 2 + %78 = insertelement <4 x float> undef, float %4, i32 0 + %79 = insertelement <4 x float> %78, float %5, i32 1 + %80 = insertelement <4 x float> %79, float %6, i32 2 + %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3 + %82 = insertelement <4 x float> undef, float %73, i32 0 + %83 = insertelement <4 x float> %82, float %75, i32 1 + %84 = insertelement <4 x float> %83, float %77, i32 2 + %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 + %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85) + %87 = insertelement <4 x float> undef, float %86, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #3 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } +attributes #2 = { readonly } +attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll new file mode 100644 index 00000000000..93b1b51a0d0 --- /dev/null +++ b/test/CodeGen/AMDGPU/load.ll @@ -0,0 +1,709 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + 
+;===------------------------------------------------------------------------===; +; GLOBAL ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load an i8 value from the global address space. +; FUNC-LABEL: {{^}}load_i8: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} + +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8, i8 addrspace(1)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_sext: +; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 8 +; SI: buffer_load_sbyte +define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = load i8, i8 addrspace(1)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in + %1 = zext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_sext: +; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: 8 +; R600-DAG: 8 + +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in + %1 = sext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_sext: +; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] +; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal +; R600-DAG: 8 +; R600-DAG: 8 +; R600-DAG: 8 +; R600-DAG: 8 +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in + %1 = sext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; Load an i16 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i16: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %0 = load i16 , i16 addrspace(1)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext: +; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 16 +; SI: buffer_load_sshort +define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %0 = load i16, i16 addrspace(1)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16: +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; SI: buffer_load_ushort +; SI: buffer_load_ushort +define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in + %1 = zext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_sext: +; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: 16 +; R600-DAG: 16 +; SI: buffer_load_sshort +; SI: buffer_load_sshort +define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in + %1 = sext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16: +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in + %1 = zext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_sext: +; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] +; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal +; R600-DAG: 16 +; R600-DAG: 16 +; R600-DAG: 16 +; R600-DAG: 16 +; SI: buffer_load_sshort +; SI: buffer_load_sshort +; SI: buffer_load_sshort +; SI: buffer_load_sshort +define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in + %1 = sext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; load an i32 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: buffer_load_dword v{{[0-9]+}} +define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; load a f32 value from the global address space. +; FUNC-LABEL: {{^}}load_f32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: buffer_load_dword v{{[0-9]+}} +define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + store float %0, float addrspace(1)* %out + ret void +} + +; load a v2f32 value from the global address space +; FUNC-LABEL: {{^}}load_v2f32: +; R600: MEM_RAT +; R600: VTX_READ_64 +; SI: buffer_load_dwordx2 +define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(1)* %in + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64: +; R600: VTX_READ_64 +; SI: buffer_load_dwordx2 +define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +entry: + %0 = load i64, i64 addrspace(1)* %in + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64_sext: +; R600: MEM_RAT +; R600: MEM_RAT +; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; R600: 31 +; SI: buffer_load_dword + +define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = sext i32 %0 to i64 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64_zext: +; R600: MEM_RAT +; R600: MEM_RAT +define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = zext i32 %0 to i64 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v8i32: +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; XXX: We should be using DWORDX4 instructions on SI. +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { +entry: + %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in + store <8 x i32> %0, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v16i32: +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; XXX: We should be using DWORDX4 instructions on SI. 
+; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { +entry: + %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in + store <16 x i32> %0, <16 x i32> addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; CONSTANT ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load a sign-extended i8 value +; FUNC-LABEL: {{^}}load_const_i8_sext: +; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 8 +; SI: buffer_load_sbyte v{{[0-9]+}}, +define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = load i8, i8 addrspace(2)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an aligned i8 value +; FUNC-LABEL: {{^}}load_const_i8_aligned: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = load i8, i8 addrspace(2)* %in + %1 = zext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an un-aligned i8 value +; FUNC-LABEL: {{^}}load_const_i8_unaligned: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 + %1 = load i8, i8 addrspace(2)* %0 + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; Load a sign-extended i16 value +; FUNC-LABEL: {{^}}load_const_i16_sext: +; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 16 +; SI: buffer_load_sshort +define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = load i16, i16 addrspace(2)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an aligned i16 value +; FUNC-LABEL: {{^}}load_const_i16_aligned: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = load i16, i16 addrspace(2)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an un-aligned i16 value +; FUNC-LABEL: {{^}}load_const_i16_unaligned: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 + %1 = load i16, i16 addrspace(2)* %0 + %2 = zext i16 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; Load an i32 value from the constant address space. 
+; FUNC-LABEL: {{^}}load_const_addrspace_i32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: s_load_dword s{{[0-9]+}} +define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +entry: + %0 = load i32, i32 addrspace(2)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Load a f32 value from the constant address space. +; FUNC-LABEL: {{^}}load_const_addrspace_f32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: s_load_dword s{{[0-9]+}} +define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) { + %1 = load float, float addrspace(2)* %in + store float %1, float addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; LOCAL ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load an i8 value from the local address space. +; FUNC-LABEL: {{^}}load_i8_local: +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { + %1 = load i8, i8 addrspace(3)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_sext_local: +; R600: LDS_UBYTE_READ_RET +; R600: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { +entry: + %0 = load i8, i8 addrspace(3)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_local: +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +; SI: ds_read_u8 +define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in + %1 = zext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_sext_local: +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +; SI: ds_read_i8 +define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in + %1 = sext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_local: +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_sext_local: +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +; SI: ds_read_i8 +; SI: ds_read_i8 +; SI: ds_read_i8 +define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in + %1 = sext 
<4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; Load an i16 value from the local address space. +; FUNC-LABEL: {{^}}load_i16_local: +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { +entry: + %0 = load i16 , i16 addrspace(3)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext_local: +; R600: LDS_USHORT_READ_RET +; R600: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { +entry: + %0 = load i16, i16 addrspace(3)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_local: +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +; SI: ds_read_u16 +define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in + %1 = zext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_sext_local: +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +; SI: ds_read_i16 +define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in + %1 = sext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_local: +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in + %1 = zext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_sext_local: +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +; SI: ds_read_i16 +; SI: ds_read_i16 +; SI: ds_read_i16 +define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in + %1 = sext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; load an i32 value from the local address space. +; FUNC-LABEL: {{^}}load_i32_local: +; R600: LDS_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_b32 +define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = load i32, i32 addrspace(3)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; load a f32 value from the local address space. 
+; FUNC-LABEL: {{^}}load_f32_local: +; R600: LDS_READ_RET +; SI: s_mov_b32 m0 +; SI: ds_read_b32 +define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) { +entry: + %0 = load float, float addrspace(3)* %in + store float %0, float addrspace(1)* %out + ret void +} + +; load a v2f32 value from the local address space +; FUNC-LABEL: {{^}}load_v2f32_local: +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; SI: s_mov_b32 m0 +; SI: ds_read_b64 +define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(3)* %in + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; Test loading a i32 and v2i32 value from the same base pointer. +; FUNC-LABEL: {{^}}load_i32_v2i32_local: +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; SI-DAG: ds_read_b32 +; SI-DAG: ds_read2_b32 +define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { + %scalar = load i32, i32 addrspace(3)* %in + %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* + %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 + %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4 + %vec1 = insertelement <2 x i32> , i32 %scalar, i32 0 + %vec = add <2 x i32> %vec0, %vec1 + store <2 x i32> %vec, <2 x i32> addrspace(1)* %out + ret void +} + + +@lds = addrspace(3) global [512 x i32] undef, align 4 + +; On SI we need to make sure that the base offset is a register and not +; an immediate. +; FUNC-LABEL: {{^}}load_i32_local_const_ptr: +; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 +; SI: ds_read_b32 v0, v[[ZERO]] offset:4 +; R600: LDS_READ_RET +define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 + %tmp1 = load i32, i32 addrspace(3)* %tmp0 + %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 %tmp1, i32 addrspace(1)* %tmp2 + ret void +} diff --git a/test/CodeGen/AMDGPU/load.vec.ll b/test/CodeGen/AMDGPU/load.vec.ll new file mode 100644 index 00000000000..02f883cd8e9 --- /dev/null +++ b/test/CodeGen/AMDGPU/load.vec.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; load a v2i32 value from the global address space. +; EG: {{^}}load_v2i32: +; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0 +; SI: {{^}}load_v2i32: +; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + store <2 x i32> %a, <2 x i32> addrspace(1)* %out + ret void +} + +; load a v4i32 value from the global address space. 
+; EG: {{^}}load_v4i32: +; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0 +; SI: {{^}}load_v4i32: +; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}] +define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + store <4 x i32> %a, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/load64.ll b/test/CodeGen/AMDGPU/load64.ll new file mode 100644 index 00000000000..74beabdc007 --- /dev/null +++ b/test/CodeGen/AMDGPU/load64.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; load a f64 value from the global address space. +; CHECK-LABEL: {{^}}load_f64: +; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %1 = load double, double addrspace(1)* %in + store double %1, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}load_i64: +; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tmp = load i64, i64 addrspace(1)* %in + store i64 %tmp, i64 addrspace(1)* %out, align 8 + ret void +} + +; Load a f64 value from the constant address space. +; CHECK-LABEL: {{^}}load_const_addrspace_f64: +; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) { + %1 = load double, double addrspace(2)* %in + store double %1, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll new file mode 100644 index 00000000000..33f3159d13e --- /dev/null +++ b/test/CodeGen/AMDGPU/local-64.ll @@ -0,0 +1,167 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}local_i32_load +; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 +; BOTH: buffer_store_dword [[REG]], +define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %val = load i32, i32 addrspace(3)* %gep, align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i32_load_0_offset +; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} +; BOTH: buffer_store_dword [[REG]], +define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { + %val = load i32, i32 addrspace(3)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: +; BOTH-NOT: ADD +; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 +; BOTH: buffer_store_byte [[REG]], +define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { + %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 + %val = load i8, i8 addrspace(3)* %gep, align 4 + store i8 %val, i8 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: 
{{^}}local_i8_load_over_i16_max_offset: +; The LDS offset will be 65536 bytes, which is larger than the size of LDS on +; SI, which is why it is being OR'd with the base pointer. +; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] +; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] +; BOTH: buffer_store_byte [[REG]], +define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { + %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 + %val = load i8, i8 addrspace(3)* %gep, align 4 + store i8 %val, i8 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_load: +; BOTH-NOT: ADD +; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 + %val = load i64, i64 addrspace(3)* %gep, align 8 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_load_0_offset +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { + %val = load i64, i64 addrspace(3)* %in, align 8 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_load: +; BOTH-NOT: ADD +; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { + %gep = getelementptr double, double addrspace(3)* %in, i32 7 + %val = load double, double addrspace(3)* %gep, align 8 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_load_0_offset +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { + %val = load double, double addrspace(3)* %in, align 8 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_store: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +define void @local_i64_store(i64 addrspace(3)* %out) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 + store i64 5678, i64 addrspace(3)* %gep, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_store_0_offset: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { + store i64 1234, i64 addrspace(3)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_store: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +define void @local_f64_store(double addrspace(3)* %out) nounwind { + %gep = getelementptr double, double addrspace(3)* %out, i32 7 + store double 16.0, double addrspace(3)* %gep, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_store_0_offset +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { + store double 20.0, double addrspace(3)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_v2i64_store: +; BOTH-NOT: ADD +; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, 
{{v\[[0-9]+:[0-9]+\]}} offset:112
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: s_endpgm
+define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
+ store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: s_endpgm
+define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+ store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH: s_endpgm
+define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
+ store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH: s_endpgm
+define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+ store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
new file mode 100644
index 00000000000..2aaf977ab90
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -0,0 +1,551 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
+; EG: LDS_WRXCHG_RET *
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+
ret void +} + +; XXX - Is it really necessary to load 4 into VGPR? +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: +; EG: LDS_ADD_RET * +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: +; EG: LDS_ADD_RET * +; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset: +; EG: LDS_ADD_RET * +; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: +; EG: LDS_SUB_RET * +; GCN: ds_sub_rtn_u32 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}lds_atomic_sub_ret_i32_offset: +; EG: LDS_SUB_RET * +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32: +; EG: LDS_SUB_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: +; EG: LDS_SUB_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: +; EG: LDS_AND_RET * +; GCN: ds_and_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: +; EG: LDS_AND_RET * +; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: +; EG: LDS_OR_RET * +; GCN: ds_or_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: +; EG: LDS_OR_RET * +; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: +; EG: LDS_XOR_RET * +; GCN: ds_xor_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: +; EG: LDS_XOR_RET * +; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 
addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
+; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+; store i32 %result, i32 addrspace(1)* %out, align 4
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep =
getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: +; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; XXX - Is it really necessary to load 4 into VGPR? +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_add_u32 [[VPTR]], [[DATA]] +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: +; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset +; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32: +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset: +; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: +; GCN: ds_sub_u32 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw 
sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; GCN: ds_and_b32
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; GCN: ds_or_b32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; GCN: ds_xor_b32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
+; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst +; ret void +; } + +; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: +; GCN: ds_min_i32 +; GCN: s_endpgm +define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: +; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: +; GCN: ds_max_i32 +; GCN: s_endpgm +define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: +; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: +; GCN: ds_min_u32 +; GCN: s_endpgm +define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: +; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: +; GCN: ds_max_u32 +; GCN: s_endpgm +define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: +; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll new file mode 100644 index 00000000000..0ffa5e751b7 --- /dev/null +++ b/test/CodeGen/AMDGPU/local-atomics64.ll @@ -0,0 +1,470 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s + +; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64: +; GCN: ds_wrxchg_rtn_b64 +; GCN: s_endpgm +define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: +; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 
addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64: +; GCN: ds_add_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: +; GCN: ds_inc_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64: +; GCN: ds_sub_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: +; GCN: ds_sub_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: +; GCN: ds_dec_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void 
@lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; GCN: ds_and_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; GCN: ds_and_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; GCN: ds_or_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; GCN: ds_or_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; GCN: ds_xor_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; GCN: ds_xor_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:
+; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64: +; GCN: ds_min_rtn_i64 +; GCN: s_endpgm +define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset: +; GCN: ds_min_rtn_i64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64: +; GCN: ds_max_rtn_i64 +; GCN: s_endpgm +define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset: +; GCN: ds_max_rtn_i64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64: +; GCN: ds_min_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: +; GCN: ds_min_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64: +; GCN: ds_max_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: +; GCN: ds_max_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64: +; GCN: ds_wrxchg_rtn_b64 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: +; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + 
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64: +; GCN: ds_add_u64 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: +; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: +; GCN: ds_inc_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64: +; GCN: ds_sub_u64 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: +; GCN: ds_sub_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: s_endpgm +define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: +; GCN: ds_dec_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64: +; GCN: ds_and_b64 +; GCN: s_endpgm +define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset: +; GCN: ds_and_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i64:
+; GCN: ds_or_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; GCN: ds_or_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; GCN: ds_xor_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; GCN: ds_xor_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
+; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; GCN: ds_min_i64
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; GCN: ds_min_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; GCN: ds_max_i64
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; GCN: ds_max_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; GCN: ds_min_u64
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; GCN: ds_min_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; GCN: ds_max_u64
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; GCN: ds_max_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)*
%ptr, i32 4
+ %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
new file mode 100644
index 00000000000..06a8b1246e6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+
+; Check that the LDS size emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 38792
+
+; EG: {{^}}local_memory_two_objects:
+
+; We would like to check that the lds writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses, at different
+; constant offsets.
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+entry:
+ %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.AMDGPU.barrier.local()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
new file mode 100644
index 00000000000..9494ed75bd0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc
-march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 + + +; Check that the LDS size emitted correctly +; EG: .long 166120 +; EG-NEXT: .long 128 +; SI: .long 47180 +; SI-NEXT: .long 71560 +; CI: .long 47180 +; CI-NEXT: .long 38792 + +; FUNC-LABEL: {{^}}local_memory: + +; EG: LDS_WRITE +; SI-NOT: s_wqm_b64 +; SI: ds_write_b32 + +; GROUP_BARRIER must be the last instruction in a clause +; EG: GROUP_BARRIER +; EG-NEXT: ALU clause +; SI: s_barrier + +; EG: LDS_READ_RET +; SI: ds_read_b32 {{v[0-9]+}}, + +define void @local_memory(i32 addrspace(1)* %out) { +entry: + %y.i = call i32 @llvm.r600.read.tidig.x() #0 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i + store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 + %add = add nsw i32 %y.i, 1 + %cmp = icmp eq i32 %add, 16 + %.add = select i1 %cmp, i32 0, i32 %add + call void @llvm.AMDGPU.barrier.local() + %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add + %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/loop-address.ll b/test/CodeGen/AMDGPU/loop-address.ll new file mode 100644 index 00000000000..f60d574497d --- /dev/null +++ b/test/CodeGen/AMDGPU/loop-address.ll @@ -0,0 +1,34 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood < %s | FileCheck %s + +;CHECK: ALU_PUSH +;CHECK: LOOP_START_DX10 @11 +;CHECK: LOOP_BREAK @10 +;CHECK: POP @10 + +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } + +!opencl.kernels = !{!0, !1, !2, !3} + +!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge} +!1 = !{null} +!2 = !{null} +!3 = !{null} diff --git a/test/CodeGen/AMDGPU/loop-idiom.ll b/test/CodeGen/AMDGPU/loop-idiom.ll new file mode 100644 index 00000000000..5fd9806813c --- /dev/null +++ b/test/CodeGen/AMDGPU/loop-idiom.ll @@ -0,0 +1,51 @@ +; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s + + +; Make sure loop-idiom 
doesn't create memcpy or memset. There are no library +; implementations of these for R600. + +; FUNC: @no_memcpy +; R600-NOT: {{^}}llvm.memcpy +; SI-NOT: {{^}}llvm.memcpy +define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { +entry: + %dest = alloca i8, i32 32 + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%4, %for.body] + %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0 + %2 = getelementptr i8, i8* %dest, i32 %0 + %3 = load i8, i8 addrspace(3)* %1 + store i8 %3, i8* %2 + %4 = add i32 %0, 1 + %5 = icmp eq i32 %4, %size + br i1 %5, label %for.end, label %for.body + +for.end: + ret void +} + +; FUNC: @no_memset +; R600-NOT: {{^}}llvm.memset +; R600-NOT: {{^}}memset_pattern16: +; SI-NOT: {{^}}llvm.memset +; SI-NOT: {{^}}memset_pattern16: +define void @no_memset(i32 %size) { +entry: + %dest = alloca i8, i32 32 + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%2, %for.body] + %1 = getelementptr i8, i8* %dest, i32 %0 + store i8 0, i8* %1 + %2 = add i32 %0, 1 + %3 = icmp eq i32 %2, %size + br i1 %3, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll new file mode 100644 index 00000000000..9ac988d38d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/lshl.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 + +define void @test(i32 %p) { + %i = mul i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll new file mode 100644 index 00000000000..50e444ac26b --- /dev/null +++ b/test/CodeGen/AMDGPU/lshr.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 + +define void @test(i32 %p) { + %i = udiv i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/m0-spill.ll b/test/CodeGen/AMDGPU/m0-spill.ll new file mode 100644 index 00000000000..1dddc85f775 --- /dev/null +++ b/test/CodeGen/AMDGPU/m0-spill.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +@lds = external addrspace(3) global [64 x float] + +; CHECK-LABEL: {{^}}main: +; CHECK-NOT: v_readlane_b32 m0 +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %cmp = fcmp ueq float 0.0, %4 + br i1 %cmp, label %if, label %else + +if: + %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 + %lds_data = load float, float addrspace(3)* %lds_ptr 
+ br label %endif + +else: + %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + br label %endif + +endif: + %export = phi float [%lds_data, %if], [%interp, %else] + %5 = call i32 @llvm.SI.packf16(float %export, float %export) + %6 = bitcast i32 %5 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6) + ret void +} + +declare float @llvm.SI.fs.constant(i32, i32, i32) readnone + +declare i32 @llvm.SI.packf16(float, float) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll new file mode 100644 index 00000000000..bc071628ead --- /dev/null +++ b/test/CodeGen/AMDGPU/mad-combine.ll @@ -0,0 +1,567 @@ +; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. + +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s + +; Make sure we don't form mad with denormals +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare float @llvm.fmuladd.f32(float, float, float) #0 + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF-NOT: v_fma +; SI-DENORM-SLOWFMAF-NOT: v_mad + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fadd float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: +; 
SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = fadd float %mul, %c + %fma1 = fadd float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_mad_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fadd float %c, %mul + store float %fma, float addrspace(1)* %gep.out + 
ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = 
fsub float %mul, %c + %fma1 = fsub float %mul, %d + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %c, %mul + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + 
+ %mul = fmul float %a, %b + %fma0 = fsub float %c, %mul + %fma1 = fsub float %d, %mul + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] + +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma = fsub float %mul.neg, %c + + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, 
float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma0 = fsub float %mul.neg, %c + %fma1 = fsub float %mul.neg, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma0 = fsub float %mul.neg, %c + %fma1 = fsub float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] +; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] + +; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]] +; 
SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 + %tmp2 = fsub float %tmp1, %z + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] +; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] + +; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float 
addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 + %tmp2 = fsub float %tmp1, %z + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fmuladd y, z, (fmul u, v))) +; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 
[[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll new file mode 100644 index 00000000000..aa4194ff610 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad-sub.ll @@ -0,0 +1,215 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.fabs.f32(float) #0 + +; FUNC-LABEL: {{^}}mad_sub_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_inv_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float 
addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %c, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_f64: +; SI: v_mul_f64 +; SI: v_add_f64 +define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext + %a = load double, double addrspace(1)* %gep0, align 8 + %b = load double, double addrspace(1)* %gep1, align 8 + %c = load double, double addrspace(1)* %gep2, align 8 + %mul = fmul double %a, %b + %sub = fsub double %mul, %c + store double %sub, double addrspace(1)* %outgep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_fabs_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c.abs + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float 
addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %c.abs, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}neg_neg_mad_f32: +; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %nega = fsub float -0.000000e+00, %a + %negb = fsub float -0.000000e+00, %b + %mul = fmul float %nega, %negb + %sub = fadd float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_fabs_sub_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %b.abs = call float @llvm.fabs.f32(float %b) #0 + %mul = fmul float %a, %b.abs + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; SI: buffer_store_dword [[RESULT]] +define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %r2, %add + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}fsub_fadd_a_a_c: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; SI: buffer_store_dword [[RESULT]] +define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %add, %r2 + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll new file mode 100644 index 00000000000..86d75a63ca4 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad_int24.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}i32_mad24: +; Signed 24-bit multiply is not supported on pre-Cayman GPUs. +; EG: MULLO_INT +; Make sure we aren't masking the inputs. +; CM-NOT: AND +; CM: MULADD_INT24 +; SI-NOT: and +; SI: v_mad_i32_i24 +define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = shl i32 %a, 8 + %a_24 = ashr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = ashr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + %3 = add i32 %2, %c + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_imul24 +; SI: v_mad_i32_i24 +define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone + %add = add i32 %mul, %src2 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll new file mode 100644 index 00000000000..95fe3411959 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad_uint24.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}u32_mad24: +; EG: MULADD_UINT24 +; SI: v_mad_u32_u24 + +define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + %3 = add i32 %2, %c + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i16_mad24: +; The order of A and B does not matter. 
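+; (The EG and SI patterns below therefore leave the source operands generic
+; rather than pinning them to particular registers.)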
+; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 16 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 + +define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +entry: + %0 = mul i16 %a, %b + %1 = add i16 %0, %c + %2 = sext i16 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mad24: +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + +define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +entry: + %0 = mul i8 %a, %b + %1 = add i8 %0, %c + %2 = sext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; This tests for a bug where the mad_u24 pattern matcher would call +; SimplifyDemandedBits on the first operand of the mul instruction +; assuming that the pattern would be matched to a 24-bit mad. This +; led to some instructions being incorrectly erased when the entire +; 24-bit mad pattern wasn't being matched. + +; Check that the select instruction is not deleted. +; FUNC-LABEL: {{^}}i24_i32_i32_mad: +; EG: CNDE_INT +; SI: v_cndmask +define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %0 = ashr i32 %a, 8 + %1 = icmp ne i32 %c, 0 + %2 = select i1 %1, i32 %0, i32 34 + %3 = mul i32 %2, %c + %4 = add i32 %3, %d + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll new file mode 100644 index 00000000000..933bb016d2c --- /dev/null +++ b/test/CodeGen/AMDGPU/madak.ll @@ -0,0 +1,193 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; FIXME: Enable VI + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; GCN-LABEL: {{^}}madak_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 +define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure this is only folded with one use. This is a code size +; optimization and if we fold the immediate multiple times, we'll undo +; it. 
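+; With two users, the expected code below materializes 10.0 (0x41200000) once
+; into [[VK]] with v_mov_b32 and feeds it to two plain v_mad_f32 instructions,
+; instead of repeating the 32-bit literal in two v_madak_f32 encodings.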
+ +; GCN-LABEL: {{^}}madak_2_use_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] +; GCN: s_endpgm +define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2 + + %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + + %a = load float, float addrspace(1)* %in.gep.0, align 4 + %b = load float, float addrspace(1)* %in.gep.1, align 4 + %c = load float, float addrspace(1)* %in.gep.2, align 4 + + %mul0 = fmul float %a, %b + %mul1 = fmul float %a, %c + %madak0 = fadd float %mul0, 10.0 + %madak1 = fadd float %mul1, 10.0 + + store float %madak0, float addrspace(1)* %out.gep.0, align 4 + store float %madak1, float addrspace(1)* %out.gep.1, align 4 + ret void +} + +; GCN-LABEL: {{^}}madak_m_inline_imm_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + + %mul = fmul float 4.0, %a + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure nothing weird happens with a value that is also allowed as +; an inline immediate. 
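+; The constant here is 4.0, which already has an inline-immediate encoding, so
+; the checks below expect an ordinary v_mad_f32 taking 4.0 directly rather than
+; a v_madak_f32 carrying a 32-bit literal.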
+ +; GCN-LABEL: {{^}}madak_inline_imm_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 4.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; We can't use an SGPR when forming madak +; GCN-LABEL: {{^}}s_v_madak_f32: +; GCN: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] +define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: @v_s_madak_f32 +; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] +define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_s_madak_f32: +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +; GCN: s_endpgm +define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 
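+  ; The fabs below puts a source modifier on src0; per the label and the check
+  ; above, this is expected to block the madak form and fall back to v_mad_f32.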
+ + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %mul = fmul float %a.fabs, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} +; GCN: s_endpgm +define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %mul = fmul float %a, %b.fabs + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll new file mode 100644 index 00000000000..ba7bb221a99 --- /dev/null +++ b/test/CodeGen/AMDGPU/madmk.ll @@ -0,0 +1,205 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; GCN-LABEL: {{^}}madmk_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}madmk_2_use_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] +; GCN: s_endpgm +define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %in.gep.2 = getelementptr 
float, float addrspace(1)* %in.gep.0, i32 2 + + %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + + %a = load float, float addrspace(1)* %in.gep.0, align 4 + %b = load float, float addrspace(1)* %in.gep.1, align 4 + %c = load float, float addrspace(1)* %in.gep.2, align 4 + + %mul0 = fmul float %a, 10.0 + %mul1 = fmul float %a, 10.0 + %madmk0 = fadd float %mul0, %b + %madmk1 = fadd float %mul1, %c + + store float %madmk0, float addrspace(1)* %out.gep.0, align 4 + store float %madmk1, float addrspace(1)* %out.gep.1, align 4 + ret void +} + +; We don't get any benefit if the constant is an inline immediate. +; GCN-LABEL: {{^}}madmk_inline_imm_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] +define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %mul = fmul float %a, 4.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_s_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_s_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}scalar_vector_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %b = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %mul = fmul float %a.fabs, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| +define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b.fabs + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 +define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, 2.0 + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: {{^}}kill_madmk_verifier_error: +; SI: s_xor_b64 +; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c +; SI: s_or_b64 +define void @kill_madmk_verifier_error() nounwind { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb6, %bb + %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] + %tmp3 = fsub float undef, %tmp + %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 + br i1 %tmp5, label %bb1, label %bb6 + +bb6: ; preds = %bb2 + %tmp4 = fmul float %tmp, undef + %tmp7 = fmul float %tmp4, 0x40E55DD180000000 + %tmp8 = fadd float %tmp7, undef + br label %bb2 +} diff --git a/test/CodeGen/AMDGPU/max-literals.ll b/test/CodeGen/AMDGPU/max-literals.ll new file mode 100644 index 00000000000..c357524b140 --- /dev/null +++ b/test/CodeGen/AMDGPU/max-literals.ll @@ -0,0 +1,67 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: ADD * + +define void @main(<4 x 
float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = fadd float %0, 2.0 + %6 = fadd float %1, 3.0 + %7 = fadd float %2, 4.0 + %8 = fadd float %3, 5.0 + %9 = bitcast float %4 to i32 + %10 = mul i32 %9, 6 + %11 = bitcast i32 %10 to float + %12 = insertelement <4 x float> undef, float %5, i32 0 + %13 = insertelement <4 x float> %12, float %6, i32 1 + %14 = insertelement <4 x float> %13, float %7, i32 2 + %15 = insertelement <4 x float> %14, float %8, i32 3 + %16 = insertelement <4 x float> %15, float %11, i32 3 + + %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) + %18 = insertelement <4 x float> undef, float %17, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) + ret void +} + +; CHECK-LABEL: {{^}}main2: +; CHECK-NOT: ADD * + +define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = fadd float %0, 2.0 + %6 = fadd float %1, 3.0 + %7 = fadd float %2, 4.0 + %8 = fadd float %3, 2.0 + %9 = bitcast float %4 to i32 + %10 = mul i32 %9, 6 + %11 = bitcast i32 %10 to float + %12 = insertelement <4 x float> undef, float %5, i32 0 + %13 = insertelement <4 x float> %12, float %6, i32 1 + %14 = insertelement <4 x float> %13, float %7, i32 2 + %15 = insertelement <4 x float> %14, float %8, i32 3 + %16 = insertelement <4 x float> %15, float %11, i32 3 + + %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) + %18 = insertelement <4 x float> undef, float %17, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll new file mode 100644 index 00000000000..fef3e2f0a21 --- /dev/null +++ b/test/CodeGen/AMDGPU/max.ll @@ -0,0 +1,168 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax_sge_i32 +; SI: v_max_i32_e32 +define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imax_sge_i32 +; SI: s_max_i32 +define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* 
%out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sge i32 %a, 9 + %val = select i1 %cmp, i32 %a, i32 9 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sgt i32 %a, 9 + %val = select i1 %cmp, i32 %a, i32 9 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imax_sgt_i32 +; SI: v_max_i32_e32 +define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sgt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imax_sgt_i32 +; SI: s_max_i32 +define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sgt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax_uge_i32 +; SI: v_max_u32_e32 +define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp uge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umax_uge_i32 +; SI: s_max_u32 +define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp uge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax_ugt_i32 +; SI: v_max_u32_e32 +define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ugt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umax_ugt_i32 +; SI: s_max_u32 +define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp ugt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure redundant and removed +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: +; 
SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { + %a.ext = zext i16 %a to i32 + %b.ext = zext i16 %b to i32 + %cmp = icmp ugt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} + +; Make sure redundant sign_extend_inreg removed. + +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { + %a.ext = sext i16 %a to i32 + %b.ext = sext i16 %b to i32 + %cmp = icmp sgt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %shl = shl i32 %val, 16 + %sextinreg = ashr i32 %shl, 16 + store i32 %sextinreg, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should get match min/max through extends inserted by +; legalization. + +; FUNC-LABEL: {{^}}s_test_imin_sge_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_ge_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sge i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll new file mode 100644 index 00000000000..cfb94b272e5 --- /dev/null +++ b/test/CodeGen/AMDGPU/max3.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax3_sgt_i32 +; SI: v_max3_i32 +define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp sgt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp sgt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax3_ugt_i32 +; SI: v_max3_u32 +define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + 
%a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ugt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ugt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll new file mode 100644 index 00000000000..dbf9d4481ff --- /dev/null +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -0,0 +1,536 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s + +; Run with devices with different unaligned load restrictions. + +; TODO: Vector element tests +; TODO: Non-zero base offset for load and store combinations +; TODO: Same base addrspacecasted + + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: +; GCN: buffer_store_dword v +define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: +; GCN: buffer_store_dword v +define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 0, i16 addrspace(1)* %out.gep.1 + store i16 0, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: +; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 +; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b +; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = 
getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* + store float 1.0, float addrspace(1)* %out.gep.1.bc + store i32 456, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + store i32 123, i32 addrspace(1)* %out.gep.1.bc + store float 4.0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: +; GCN: buffer_store_dwordx4 +define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 333, i32 addrspace(1)* %out.gep.3 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx2 v +define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 8.0, float addrspace(1)* %out + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + ret void +} + +; First store is out of order. Because of order of combines, the +; consecutive store fails because only some of the stores have been +; replaced with integer constant stores, and then won't merge because +; the types are different. 
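+; (Illustrative sketch added by the editor, not part of the original test; pointer
+; names are made up.) The intermediate state the comment above describes looks
+; roughly like this after the partial combine:
+;   store i32 1065353216, i32 addrspace(1)* %p1   ; float 1.0 already rewritten to i32
+;   store i32 1073741824, i32 addrspace(1)* %p2   ; float 2.0 already rewritten to i32
+;   store float 4.0, float addrspace(1)* %p3      ; not yet rewritten
+;   store float 8.0, float addrspace(1)* %p0      ; the out-of-order store
+; The mix of i32 and float stored types is what blocks the dwordx4 merge.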
+ +; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + store float 8.0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword +; SI-NOT: buffer_store_dword +; GCN: s_endpgm +define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: +; XGCN: buffer_store_dwordx4 +; XGCN: buffer_store_dwordx4 + +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 + %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out.gep.2 + store i64 333, i64 addrspace(1)* %out.gep.3 + store i64 1234, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[LOAD]] +define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %lo = load i32, i32 addrspace(1)* %in.gep.0 + %hi = 
load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out.gep.0 + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %hi, i32 addrspace(1)* %out + store i32 %lo, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: +; SI-DAG: buffer_load_dwordx2 +; SI-DAG: buffer_load_dword v +; GCN: s_waitcnt +; SI-DAG: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 v +; GCN: s_endpgm +define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 + + %x = load float, float addrspace(1)* %in + %y = load float, float addrspace(1)* %in.gep.1 + %z = load float, float addrspace(1)* %in.gep.2 + %w = load float, float addrspace(1)* %in.gep.3 + + store float %x, float addrspace(1)* %out + store float %y, float 
addrspace(1)* %out.gep.1 + store float %z, float addrspace(1)* %out.gep.2 + store float %w, float addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 + + %x = load i32, i32 addrspace(1)* %in.gep.0 + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out.gep.0 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: s_barrier +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.AMDGPU.barrier.local() #1 + + store i32 %w, i32 addrspace(1)* %out.gep.3 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %x, i32 addrspace(1)* %out + + ret void +} + +; TODO: Re-packing of loaded register required. Maybe an IR pass +; should catch this? 
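+; (Illustrative sketch added by the editor, not part of the original test; register
+; numbers are made up.) "Re-packing" here means that if the four loads below were
+; merged into one buffer_load_dwordx4, the reversed stores would still need the
+; lanes shuffled into a contiguous register tuple before a single
+; buffer_store_dwordx4, roughly:
+;   buffer_load_dwordx4 v[0:3], ...
+;   v_mov_b32_e32 v7, v0
+;   v_mov_b32_e32 v6, v1
+;   v_mov_b32_e32 v5, v2
+;   v_mov_b32_e32 v4, v3
+;   buffer_store_dwordx4 v[4:7], ...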
+ +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: s_barrier +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.AMDGPU.barrier.local() #1 + + store i32 %w, i32 addrspace(1)* %out + store i32 %z, i32 addrspace(1)* %out.gep.1 + store i32 %y, i32 addrspace(1)* %out.gep.2 + store i32 %x, i32 addrspace(1)* %out.gep.3 + + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: buffer_store_dword [[LOAD]] +; GCN: s_endpgm +define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in, align 4 + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out, align 4 + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; This works once AA is enabled on the subtarget +; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: +; GCN: buffer_load_dwordx4 
[[LOAD:v\[[0-9]+:[0-9]+\]]] +; XGCN: buffer_store_dwordx4 [[LOAD]] +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in + + %x = extractelement <4 x i32> %vec, i32 0 + %y = extractelement <4 x i32> %vec, i32 1 + %z = extractelement <4 x i32> %vec, i32 2 + %w = extractelement <4 x i32> %vec, i32 3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: +; GCN: ds_write_b8 +; GCN: ds_write_b8 +; GCN: s_endpgm +define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 + + store i8 123, i8 addrspace(3)* %out.gep.1 + store i8 456, i8 addrspace(3)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: +; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 +; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} +define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out.gep.2 + store i32 333, i32 addrspace(3)* %out.gep.3 + store i32 1234, i32 addrspace(3)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 + +attributes #0 = { nounwind } +attributes #1 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll new file mode 100644 index 00000000000..0332d1a8e40 --- /dev/null +++ b/test/CodeGen/AMDGPU/min.ll @@ -0,0 +1,189 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin_sle_i32 +; SI: v_min_i32_e32 +define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sle i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} 
+ +; FUNC-LABEL: @s_test_imin_sle_i32 +; SI: s_min_i32 +define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sle i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imin_slt_i32 +; SI: v_min_i32_e32 +define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp slt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imin_slt_i32 +; SI: s_min_i32 +define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp slt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: +; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp slt i32 %a, 8 + %val = select i1 %cmp, i32 %a, i32 8 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: +; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sle i32 %a, 8 + %val = select i1 %cmp, i32 %a, i32 8 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ule_i32 +; SI: v_min_u32_e32 +define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ule i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ule_i32 +; SI: s_min_u32 +define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp ule i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i32 +; SI: v_min_u32_e32 +define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ult_i32 +; SI: s_min_u32 +define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) 
nounwind { + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i32_multi_use +; SI-NOT: v_min +; SI: v_cmp_lt_u32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: s_endpgm +define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid + %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep0, align 4 + store i1 %cmp, i1 addrspace(1)* %outgep1 + ret void +} + +; Make sure redundant and removed +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { + %a.ext = zext i16 %a to i32 + %b.ext = zext i16 %b to i32 + %cmp = icmp ult i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} + +; Make sure redundant sign_extend_inreg removed. + +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { + %a.ext = sext i16 %a to i32 + %b.ext = sext i16 %b to i32 + %cmp = icmp slt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %shl = shl i32 %val, 16 + %sextinreg = ashr i32 %shl, 16 + store i32 %sextinreg, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should get match min/max through extends inserted by +; legalization. 
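+; (Illustrative note added by the editor, not part of the original test.) After
+; legalization the i16 compare below is widened to i32, roughly:
+;   %a.ext = sext i16 %a to i32
+;   %b.ext = sext i16 %b to i32
+;   %cmp   = icmp sle i32 %a.ext, %b.ext
+;   %val   = select i1 %cmp, i32 %a.ext, i32 %b.ext
+; which in principle could fold to a single s_min_i32 on the extended values, but
+; currently selects to the s_sext_i32_i16 / v_cmp_le_i32 / v_cndmask_b32 sequence
+; the checks expect.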
+ +; FUNC-LABEL: {{^}}s_test_imin_sle_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_le_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sle i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll new file mode 100644 index 00000000000..38ef46d1bdd --- /dev/null +++ b/test/CodeGen/AMDGPU/min3.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin3_slt_i32 +; SI: v_min3_i32 +define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp slt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_ult_i32 +; SI: v_min3_u32 +define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ult i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ult i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_umin_umin +; SI: v_min_i32 +; SI: v_min3_i32 +define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %d = load i32, i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = 
icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %i1 + %i2 = select i1 %icmp2, i32 %i0, i32 %i1 + + store i32 %i2, i32 addrspace(1)* %outgep1, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_2_uses +; SI-NOT: v_min3 +define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %d = load i32, i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %c + %i2 = select i1 %icmp2, i32 %i0, i32 %c + + store i32 %i2, i32 addrspace(1)* %outgep0, align 4 + store i32 %i0, i32 addrspace(1)* %outgep1, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll new file mode 100644 index 00000000000..4af9cdf1b96 --- /dev/null +++ b/test/CodeGen/AMDGPU/missing-store.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 + +; Make sure when the load from %ptr2 is folded the chain isn't lost, +; resulting in losing the store to gptr + +; FUNC-LABEL: {{^}}missing_store_reduced: +; SI: ds_read_b64 +; SI: buffer_store_dword +; SI: buffer_load_dword +; SI: buffer_store_dword +; SI: s_endpgm +define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + store i32 %tmp2, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } + diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll new file mode 100644 index 00000000000..b19163f294e --- /dev/null +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -0,0 +1,183 @@ +; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s + +declare i32 @llvm.r600.read.tidig.x() readnone + +;;;==========================================================================;;; +;;; MUBUF LOAD TESTS +;;;==========================================================================;;; + +; MUBUF load with an immediate byte offset that fits into 12-bits +; CHECK-LABEL: {{^}}mubuf_load0: +; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 +define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 + %1 = 
load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; MUBUF load with the largest possible immediate offset +; CHECK-LABEL: {{^}}mubuf_load1: +; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 +define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %1 = load i8, i8 addrspace(1)* %0 + store i8 %1, i8 addrspace(1)* %out + ret void +} + +; MUBUF load with an immediate byte offset that doesn't fit into 12-bits +; CHECK-LABEL: {{^}}mubuf_load2: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 +; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 +define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; MUBUF load with a 12-bit immediate offset and a register offset +; CHECK-LABEL: {{^}}mubuf_load3: +; CHECK-NOT: ADD +; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 +define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset + %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 + %2 = load i32, i32 addrspace(1)* %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}soffset_max_imm: +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc +define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +main_body: + %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp2 = shl i32 %6, 2 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp4 = add i32 %6, 16 + %tmp5 = bitcast float 0.0 to i32 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +; Make sure immediates that aren't inline constants don't get folded into +; the soffset operand. +; FIXME: for this test we should be smart enough to shift the immediate into +; the offset field. 
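+; (Illustrative note added by the editor, not part of the original test; register
+; numbers are made up.) 65 (0x41) is outside the SI inline-constant range for
+; integers (-16..64), so it is currently materialized into an SGPR:
+;   s_movk_i32 s8, 0x41
+;   buffer_load_dword v0, v1, s[4:7], s8 offen glc
+; Since 65 also fits the unsigned 12-bit immediate offset field, the FIXME above
+; suggests folding it there instead, e.g.:
+;   buffer_load_dword v0, v1, s[4:7], 0 offen offset:65 glc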
+; CHECK-LABEL: {{^}}soffset_no_fold: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 +; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc +define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +main_body: + %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp2 = shl i32 %6, 2 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp4 = add i32 %6, 16 + %tmp5 = bitcast float 0.0 to i32 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +;;;==========================================================================;;; +;;; MUBUF STORE TESTS +;;;==========================================================================;;; + +; MUBUF store with an immediate byte offset that fits into 12-bits +; CHECK-LABEL: {{^}}mubuf_store0: +; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 +define void @mubuf_store0(i32 addrspace(1)* %out) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 + store i32 0, i32 addrspace(1)* %0 + ret void +} + +; MUBUF store with the largest possible immediate offset +; CHECK-LABEL: {{^}}mubuf_store1: +; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 + +define void @mubuf_store1(i8 addrspace(1)* %out) { +entry: + %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 + store i8 0, i8 addrspace(1)* %0 + ret void +} + +; MUBUF store with an immediate byte offset that doesn't fit into 12-bits +; CHECK-LABEL: {{^}}mubuf_store2: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 +; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 +define void @mubuf_store2(i32 addrspace(1)* %out) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 + store i32 0, i32 addrspace(1)* %0 + ret void +} + +; MUBUF store with a 12-bit immediate offset and a register offset +; CHECK-LABEL: {{^}}mubuf_store3: +; CHECK-NOT: ADD +; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 +define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset + %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 + store i32 0, i32 addrspace(1)* %1 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr: +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 +define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { + store i32 99, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 +define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: +; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
[[SOFFSET]] +define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: +; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 +; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] +define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst + ret void +} + +; CHECK-LABEL: {{^}}store_vgpr_ptr: +; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll new file mode 100644 index 00000000000..94e0f96b323 --- /dev/null +++ b/test/CodeGen/AMDGPU/mul.ll @@ -0,0 +1,200 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; mul24 and mad24 are affected + +; FUNC-LABEL: {{^}}test_mul_v2i32: +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = mul <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_mul_v4i32: +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = mul <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: +; SI: s_load_dword +; SI: s_load_dword +; SI: s_mul_i32 +; SI: buffer_store_dword +define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: +; SI: s_load_dword +; SI: s_load_dword +; SI: v_mul_lo_i32 +; SI: buffer_store_dword +define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top +; 32-bits of both arguments are sign bits. +; FUNC-LABEL: {{^}}mul64_sext_c: +; EG-DAG: MULLO_INT +; EG-DAG: MULHI_INT +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_i32 +define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { +entry: + %0 = sext i32 %in to i64 + %1 = mul i64 %0, 80 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_mul64_sext_c: +; EG-DAG: MULLO_INT +; EG-DAG: MULHI_INT +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_i32 +; SI: s_endpgm +define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: +; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} +; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} +; SI: s_endpgm +define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 9 + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_mul_i32: +; SI: s_load_dword [[SRC0:s[0-9]+]], +; SI: s_load_dword [[SRC1:s[0-9]+]], +; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %mul = mul i32 %a, %b + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_mul_i32: +; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = mul i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; A standard 64-bit multiply. 
The expansion should be around 6 instructions. +; It would be difficult to match the expansion correctly without writing +; a really complicated list of FileCheck expressions. I don't want +; to confuse people who may 'break' this test with a correct optimization, +; so this test just uses FUNC-LABEL to make sure the compiler does not +; crash with a 'failed to select' error. + +; FUNC-LABEL: {{^}}s_mul_i64: +define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %mul = mul i64 %a, %b + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_mul_i64: +; SI: v_mul_lo_i32 +define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %mul = mul i64 %a, %b + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}mul32_in_branch: +; SI: s_mul_i32 +define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i32, i32 addrspace(1)* %in + br label %endif + +else: + %2 = mul i32 %a, %b + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}mul64_in_branch: +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_u32 +; SI: s_endpgm +define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = mul i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll new file mode 100644 index 00000000000..7609dcc87af --- /dev/null +++ b/test/CodeGen/AMDGPU/mul_int24.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}i32_mul24: +; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
+; EG: MULLO_INT +; Make sure we are not masking the inputs +; CM-NOT: AND +; CM: MUL_INT24 +; SI-NOT: and +; SI: v_mul_i32_i24 +define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = ashr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = ashr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll new file mode 100644 index 00000000000..e640a7cd69f --- /dev/null +++ b/test/CodeGen/AMDGPU/mul_uint24.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}u32_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: v_mul_u32_u24 + +define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i16_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; EG: 16 +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 +define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) { +entry: + %0 = mul i16 %a, %b + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + +define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) { +entry: + %0 = mul i8 %a, %b + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Multiply with 24-bit inputs and 64-bit output +; FUNC-LABEL: {{^}}mul24_i64: +; EG: MUL_UINT24 +; EG: MULHI +; SI: v_mul_u32_u24 +; FIXME: SI supports 24-bit mulhi +; SI: v_mul_hi_u32 +define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = shl i64 %a, 40 + %a_24 = lshr i64 %0, 40 + %1 = shl i64 %b, 40 + %b_24 = lshr i64 %1, 40 + %2 = mul i64 %a_24, %b_24 + store i64 %2, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/mulhu.ll b/test/CodeGen/AMDGPU/mulhu.ll new file mode 100644 index 00000000000..29b0944a553 --- /dev/null +++ b/test/CodeGen/AMDGPU/mulhu.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab +;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} +;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 + +define void @test(i32 %p) { + %i = udiv i32 %p, 3 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float>
@llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll new file mode 100644 index 00000000000..9a814b579de --- /dev/null +++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s +; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s +; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s + +@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 + +; FUNC-LABEL: {{^}}load_extern_const_init: +define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 + +; FUNC-LABEL: {{^}}load_undef_const_init: +define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll new file mode 100644 index 00000000000..e4328ecbaca --- /dev/null +++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -0,0 +1,191 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; Make sure we don't turn the 32-bit argument load into a 16-bit +; load. There aren't extending scalar loads, so that would require +; using a buffer_load instruction. + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: +; SI: s_load_dword s +; SI: buffer_store_short v +define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i16 + store i16 %trunc, i16 addrspace(1)* %out + ret void +} + +; It should be OK (and probably performance neutral) to reduce this, +; but we don't know if the load is uniform yet.
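To make the rationale above concrete, here is a minimal sketch (illustrative IR only, not part of this patch; function names are made up) of the narrowing these tests guard against. Shrinking an i32 load whose only use is a trunc into an i16 load would leave no scalar (SMRD) pattern to select, since there are no extending scalar loads, so a uniform load narrowed this way would have to become a buffer load:

; Wide form the tests expect to be kept: load the full i32, truncate the value afterwards.
define void @keep_wide_load(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %val = load i32, i32 addrspace(1)* %in
  %trunc = trunc i32 %val to i16
  store i16 %trunc, i16 addrspace(1)* %out
  ret void
}

; Hypothetical shrunk form: the i16 load has no extending SMRD equivalent,
; so for a uniform pointer it could only be lowered through a buffer_load.
define void @shrunk_load(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %in.i16 = bitcast i32 addrspace(1)* %in to i16 addrspace(1)*
  %val = load i16, i16 addrspace(1)* %in.i16
  store i16 %val, i16 addrspace(1)* %out
  ret void
}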
+ +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: +; SI: buffer_load_dword v +; SI: buffer_store_short v +define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i16 + store i16 %trunc, i16 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i1 + store i1 %trunc, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i1 + store i1 %trunc, i1 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: +; SI: s_load_dword s +; SI: buffer_store_dword v +define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { + %trunc = trunc i64 %arg to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: +; SI: buffer_load_dword v +; SI: buffer_store_dword v +define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %trunc = trunc i64 %load to i32 + store i32 %trunc, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: +; SI: s_load_dword s +; SI: buffer_store_dword v +define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { + %srl = lshr i64 %arg, 32 + %trunc = trunc i64 %srl to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: +; SI: buffer_load_dword v +; SI: buffer_store_dword v +define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = 
getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %srl = lshr i64 %load, 32 + %trunc = trunc i64 %srl to i32 + store i32 %trunc, i32 addrspace(1)* %gep.out + ret void +} + +; Might as well reduce to 8-bit loads. +; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { + %trunc = trunc i16 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: +; SI: buffer_load_ubyte v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i16, i16 addrspace(1)* %gep.in + %trunc = trunc i16 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { + %srl = lshr i64 %arg, 32 + %trunc = trunc i64 %srl to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %srl = lshr i64 %load, 32 + %trunc = trunc i64 %srl to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { + %trunc = trunc i64 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %trunc = trunc i64 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll new file mode 100644 index 00000000000..816755efb07 --- /dev/null +++ b/test/CodeGen/AMDGPU/operand-folding.ll @@ -0,0 +1,113 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}fold_sgpr: +; CHECK: v_add_i32_e32 v{{[0-9]+}}, s +define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +entry: + %tmp0 = icmp ne i32 %fold, 0 + br i1 %tmp0, label %if, label %endif + +if: + %id = call i32 @llvm.r600.read.tidig.x() + %offset = add i32 %fold, %id + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset + store i32 0, i32 addrspace(1)* %tmp1 + br label %endif + +endif: + ret void +} + +; CHECK-LABEL: {{^}}fold_imm: +; 
CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 +define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +entry: + %fold = add i32 3, 2 + %tmp0 = icmp ne i32 %cmp, 0 + br i1 %tmp0, label %if, label %endif + +if: + %id = call i32 @llvm.r600.read.tidig.x() + %val = or i32 %id, %fold + store i32 %val, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; CHECK-LABEL: {{^}}fold_64bit_constant_add: +; CHECK-NOT: s_mov_b64 +; FIXME: It would be better if we could use v_add here and drop the extra +; v_mov_b32 instructions. +; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1 +; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0 +; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]] +; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, + +define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +entry: + %tmp0 = add i64 %val, 1 + store i64 %tmp0, i64 addrspace(1)* %out + ret void +} + +; Inline constants should always be folded. + +; CHECK-LABEL: {{^}}vector_inline: +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} + +define void @vector_inline(<4 x i32> addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = add i32 %tmp0, 1 + %tmp2 = add i32 %tmp0, 2 + %tmp3 = add i32 %tmp0, 3 + %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 + %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 + %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 + %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 + %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out + ret void +} + +; Immediates with one use should be folded +; CHECK-LABEL: {{^}}imm_one_use: +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} + +define void @imm_one_use(i32 addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = xor i32 %tmp0, 100 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} +; CHECK-LABEL: {{^}}vector_imm: +; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} + +define void @vector_imm(<4 x i32> addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = add i32 %tmp0, 1 + %tmp2 = add i32 %tmp0, 2 + %tmp3 = add i32 %tmp0, 3 + %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 + %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 + %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 + %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 + %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll new file mode 100644 index 00000000000..20420a84de6 --- /dev/null +++ b/test/CodeGen/AMDGPU/operand-spacing.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s + +; Make sure there isn't an extra space 
between the instruction name and first operands. + +; GCN-LABEL: {{^}}add_f32: +; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] +; GCN: buffer_store_dword [[RESULT]], +define void @add_f32(float addrspace(1)* %out, float %a, float %b) { + %result = fadd float %a, %b + store float %result, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll new file mode 100644 index 00000000000..1c04090b407 --- /dev/null +++ b/test/CodeGen/AMDGPU/or.ll @@ -0,0 +1,178 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}or_v2i32: +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = or <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}or_v4i32: +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = or <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_i32: +; SI: s_or_b32 +define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %or = or i32 %a, %b + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i32: +; SI: v_or_b32_e32 v{{[0-9]}} +define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { + %loada = load i32, i32 addrspace(1)* %a + %or = or i32 %loada, %b + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_literal_i32: +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f +define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { + %or = or i32 %a, 99999 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}vector_or_literal_i32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { + %loada = load i32, i32 addrspace(1)* %a, align 4 + %or = or i32 %loada, 65535 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} +define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { + %loada = load i32, i32 addrspace(1)* %a, align 4 + %or = or i32 %loada, 4 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_i64: +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z + +; SI: s_or_b64 +define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %or = or i64 %a, %b + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64: +; SI: v_or_b32_e32 v{{[0-9]}} +; SI: v_or_b32_e32 v{{[0-9]}} +define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %loadb = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, %loadb + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_vector_or_i64: +; SI: v_or_b32_e32 v{{[0-9]}} +; SI: v_or_b32_e32 v{{[0-9]}} +define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { + %loada = load i64, i64 addrspace(1)* %a + %or = or i64 %loada, %b + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64_loadimm: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f +; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) 
{ + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, 22470723082367 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FIXME: The or 0 should really be removed. +; FUNC-LABEL: {{^}}vector_or_i64_imm: +; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] +; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} +; SI: s_endpgm +define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, 8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: +; SI: s_load_dword s[[SREG0:[0-9]+]] +; SI: s_load_dword s[[SREG1:[0-9]+]] +; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %add = or i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}or_i1: +; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float, float addrspace(1)* %in0 + %b = load float, float addrspace(1)* %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 0.000000e+00 + %or = or i1 %acmp, %bcmp + %result = zext i1 %or to i32 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_or_i1: +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { + %cmp0 = icmp eq i32 %a, %b + %cmp1 = icmp eq i32 %c, %d + %or = or i1 %cmp0, %cmp1 + store i1 %or, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/packetizer.ll b/test/CodeGen/AMDGPU/packetizer.ll new file mode 100644 index 00000000000..49a7c0df748 --- /dev/null +++ b/test/CodeGen/AMDGPU/packetizer.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK: {{^}}test: +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z +; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W + +define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +entry: + %shl = sub i32 32, %e + %x = add i32 %x_arg, 1 + %x.0 = shl i32 %x, %shl + %x.1 = lshr i32 %x, %e + %x.2 = or i32 %x.0, %x.1 + %y = add i32 %y_arg, 1 + %y.0 = shl i32 %y, %shl + %y.1 = lshr i32 %y, %e + %y.2 = or i32 %y.0, %y.1 + %z = add i32 %z_arg, 1 + %z.0 = shl i32 %z, %shl + %z.1 = lshr i32 %z, %e + %z.2 = or i32 %z.0, %z.1 + %w = add i32 %w_arg, 1 + %w.0 = shl i32 %w, %shl + %w.1 = lshr i32 %w, %e + %w.2 = or i32 %w.0, %w.1 + %xy = or i32 %x.2, %y.2 + %zw = or i32 %z.2, %w.2 + %xyzw = or i32 %xy, %zw + store i32 %xyzw, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll new file mode 100644 index 00000000000..f32b044198a --- /dev/null +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -0,0 +1,59 @@ +; Function Attrs: nounwind +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s +; +; CFG flattening should use parallel-and mode to 
generate branch conditions and +; then merge if-regions with the same bodies. +; +; CHECK: AND_INT +; CHECK-NEXT: AND_INT +; CHECK-NEXT: OR_INT + +; FIXME: For some reason having the allocas here allowed the flatten cfg pass +; to do its transformation; however, now that we are using local memory for +; allocas, the transformation isn't happening. + +define void @_Z9chk1D_512v() #0 { +entry: + %a0 = alloca i32, align 4 + %b0 = alloca i32, align 4 + %c0 = alloca i32, align 4 + %d0 = alloca i32, align 4 + %a1 = alloca i32, align 4 + %b1 = alloca i32, align 4 + %c1 = alloca i32, align 4 + %d1 = alloca i32, align 4 + %data = alloca i32, align 4 + %0 = load i32, i32* %a0, align 4 + %1 = load i32, i32* %b0, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true:                                    ; preds = %entry + %2 = load i32, i32* %c0, align 4 + %3 = load i32, i32* %d0, align 4 + %cmp1 = icmp ne i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.end + +if.then:                                          ; preds = %land.lhs.true + store i32 1, i32* %data, align 4 + br label %if.end + +if.end:                                           ; preds = %if.then, %land.lhs.true, %entry + %4 = load i32, i32* %a1, align 4 + %5 = load i32, i32* %b1, align 4 + %cmp2 = icmp ne i32 %4, %5 + br i1 %cmp2, label %land.lhs.true3, label %if.end6 + +land.lhs.true3:                                   ; preds = %if.end + %6 = load i32, i32* %c1, align 4 + %7 = load i32, i32* %d1, align 4 + %cmp4 = icmp ne i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.end6 + +if.then5:                                         ; preds = %land.lhs.true3 + store i32 1, i32* %data, align 4 + br label %if.end6 + +if.end6:                                          ; preds = %if.then5, %land.lhs.true3, %if.end + ret void +} diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll new file mode 100644 index 00000000000..1da1e91b8ab --- /dev/null +++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -0,0 +1,66 @@ +; Function Attrs: nounwind +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; +; CFG flattening should use parallel-or to generate branch conditions and +; then merge if-regions with the same bodies. + +; FIXME: For some reason having the allocas here allowed the flatten cfg pass +; to do its transformation; however, now that we are using local memory for +; allocas, the transformation isn't happening. 
+; XFAIL: * +; +; CHECK: OR_INT +; CHECK-NEXT: OR_INT +; CHECK-NEXT: OR_INT +define void @_Z9chk1D_512v() #0 { +entry: + %a0 = alloca i32, align 4 + %b0 = alloca i32, align 4 + %c0 = alloca i32, align 4 + %d0 = alloca i32, align 4 + %a1 = alloca i32, align 4 + %b1 = alloca i32, align 4 + %c1 = alloca i32, align 4 + %d1 = alloca i32, align 4 + %data = alloca i32, align 4 + %0 = load i32, i32* %a0, align 4 + %1 = load i32, i32* %b0, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.else + +land.lhs.true: ; preds = %entry + %2 = load i32, i32* %c0, align 4 + %3 = load i32, i32* %d0, align 4 + %cmp1 = icmp ne i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %land.lhs.true + br label %if.end + +if.else: ; preds = %land.lhs.true, %entry + store i32 1, i32* %data, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %4 = load i32, i32* %a1, align 4 + %5 = load i32, i32* %b1, align 4 + %cmp2 = icmp ne i32 %4, %5 + br i1 %cmp2, label %land.lhs.true3, label %if.else6 + +land.lhs.true3: ; preds = %if.end + %6 = load i32, i32* %c1, align 4 + %7 = load i32, i32* %d1, align 4 + %cmp4 = icmp ne i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.else6 + +if.then5: ; preds = %land.lhs.true3 + br label %if.end7 + +if.else6: ; preds = %land.lhs.true3, %if.end + store i32 1, i32* %data, align 4 + br label %if.end7 + +if.end7: ; preds = %if.else6, %if.then5 + ret void +} + diff --git a/test/CodeGen/AMDGPU/predicate-dp4.ll b/test/CodeGen/AMDGPU/predicate-dp4.ll new file mode 100644 index 00000000000..6bc18759435 --- /dev/null +++ b/test/CodeGen/AMDGPU/predicate-dp4.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +; CHECK-LABEL: {{^}}main: +; CHECK: PRED_SETE_INT * Pred, +; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one +define void @main(<4 x float> inreg) #0 { +main_body: + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %IF, label %ENDIF + +IF: ; preds = %main_body + %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0) + br label %ENDIF + +ENDIF: ; preds = %IF, %main_body + %5 = phi float [%4, %IF], [0.000000e+00, %main_body] + %6 = insertelement <4 x float> undef, float %5, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll new file mode 100644 index 00000000000..0ce74d97ba8 --- /dev/null +++ b/test/CodeGen/AMDGPU/predicates.ll @@ -0,0 +1,104 @@ +; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s + +; These tests make sure the compiler is optimizing branches using predicates +; when it is legal to do so. 
+ +; CHECK: {{^}}simple_if: +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ENDIF: + %2 = phi i32 [ %in, %entry ], [ %1, %IF ] + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}simple_if_else: +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ELSE + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ELSE: + %2 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}nested_if: +; CHECK: ALU_PUSH_BEFORE +; CHECK: JUMP +; CHECK: POP +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ENDIF + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ENDIF: + %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1] + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}nested_if_else: +; CHECK: ALU_PUSH_BEFORE +; CHECK: JUMP +; CHECK: POP +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ELSE1 + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ELSE1: + %4 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll new file mode 100644 index 00000000000..a008ac98a43 --- /dev/null +++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s + +; This works because promote allocas pass replaces these with LDS atomics. + +; Private atomics have no real use, but at least shouldn't crash on it. 
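For context, a rough sketch (assumed shape with illustrative names; not part of this patch) of what the promote-alloca pass does with the tests below: the alloca is rewritten to LDS storage in addrspace(3), so the atomic operations become local-memory atomics that the backend can select as ds_* instructions. The real pass sizes the LDS array for the whole work-group and indexes it by workitem id; that bookkeeping is omitted here:

; Before promotion (as written in the test): alloca + atomicrmw on a private pointer.
; After promotion, the storage lives in LDS and the atomic operates on addrspace(3).
@promoted.tmp = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4

define void @atomicrmw_lds_sketch(i32 addrspace(1)* %out, i32 %in) {
entry:
  ; The former private pointer now points into local (LDS) memory.
  %ptr = getelementptr [2 x i32], [2 x i32] addrspace(3)* @promoted.tmp, i32 0, i32 %in
  %old = atomicrmw add i32 addrspace(3)* %ptr, i32 7 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}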
+define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic + %val = extractvalue { i32, i1 } %tmp4, 0 + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll new file mode 100644 index 00000000000..6b18a19f195 --- /dev/null +++ b/test/CodeGen/AMDGPU/private-memory-broken.ll @@ -0,0 +1,21 @@ +; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s + +; Make sure promote alloca pass doesn't crash + +; CHECK: unsupported call + +declare i32 @foo(i32*) nounwind + +define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %val = call i32 @foo(i32* %tmp3) nounwind + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll new file mode 100644 index 00000000000..1c562978050 --- /dev/null +++ b/test/CodeGen/AMDGPU/private-memory.ll @@ -0,0 +1,313 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}mova_same_clause: + +; R600: LDS_WRITE +; R600: LDS_WRITE +; R600: LDS_READ +; R600: LDS_READ + +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 + +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: 
[0x00,0x10,0x70,0xe0 +define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +entry: + %stack = alloca [5 x i32], align 4 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 + store i32 4, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 + store i32 5, i32* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 + %2 = load i32, i32* %arrayidx10, align 4 + store i32 %2, i32 addrspace(1)* %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 + %3 = load i32, i32* %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 + store i32 %3, i32 addrspace(1)* %arrayidx13 + ret void +} + +; This test checks that the stack offset is calculated correctly for structs. +; All register loads/stores should be optimized away, so there shouldn't be +; any MOVA instructions. +; +; XXX: This generated code has unnecessary MOVs, we should be able to optimize +; this. + +; FUNC-LABEL: {{^}}multiple_structs: +; R600-NOT: MOVA_INT +; SI-NOT: v_movrel +; SI-NOT: v_movrel +%struct.point = type { i32, i32 } + +define void @multiple_structs(i32 addrspace(1)* %out) { +entry: + %a = alloca %struct.point + %b = alloca %struct.point + %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 + %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 + %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 + %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 + store i32 0, i32* %a.x.ptr + store i32 1, i32* %a.y.ptr + store i32 2, i32* %b.x.ptr + store i32 3, i32* %b.y.ptr + %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 + %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 + %a.indirect = load i32, i32* %a.indirect.ptr + %b.indirect = load i32, i32* %b.indirect.ptr + %0 = add i32 %a.indirect, %b.indirect + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Test direct access of a private array inside a loop. The private array +; loads and stores should be lowered to copies, so there shouldn't be any +; MOVA instructions. 
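As a hedged illustration (not from the patch): MOVA on R600 and v_movrel on SI are only needed for indirect addressing, i.e. when the index into the promoted private array is not a compile-time constant. The accesses in these tests use constant indices, so they can be lowered to plain register copies, which is what the MOVA/v_movrel checks above and below verify:

define i32 @constant_vs_dynamic_index(i32 %idx) {
entry:
  %arr = alloca [2 x i32]
  ; Constant index: the element can be promoted straight to a register copy.
  %elt0.ptr = getelementptr [2 x i32], [2 x i32]* %arr, i32 0, i32 0
  store i32 0, i32* %elt0.ptr
  ; Variable index: would need indirect addressing (MOVA / v_movrel) or scratch memory.
  %eltN.ptr = getelementptr [2 x i32], [2 x i32]* %arr, i32 0, i32 %idx
  %val = load i32, i32* %eltN.ptr
  ret i32 %val
}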
+ +; FUNC-LABEL: {{^}}direct_loop: +; R600-NOT: MOVA_INT +; SI-NOT: v_movrel + +define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %prv_array_const = alloca [2 x i32] + %prv_array = alloca [2 x i32] + %a = load i32, i32 addrspace(1)* %in + %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %b = load i32, i32 addrspace(1)* %b_src_ptr + %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 + store i32 %a, i32* %a_dst_ptr + %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 + store i32 %b, i32* %b_dst_ptr + br label %for.body + +for.body: + %inc = phi i32 [0, %entry], [%count, %for.body] + %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 + %x = load i32, i32* %x_ptr + %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 + %y = load i32, i32* %y_ptr + %xy = add i32 %x, %y + store i32 %xy, i32* %y_ptr + %count = add i32 %inc, 1 + %done = icmp eq i32 %count, 4095 + br i1 %done, label %for.end, label %for.body + +for.end: + %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 + %value = load i32, i32* %value_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}short_array: + +; R600: MOVA_INT + +; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0 +; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0 +; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define void @short_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [2 x i16] + %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1 + store i16 0, i16* %1 + store i16 1, i16* %2 + %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index + %4 = load i16, i16* %3 + %5 = sext i16 %4 to i32 + store i32 %5, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}char_array: + +; R600: MOVA_INT + +; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0 +; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0 +define void @char_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [2 x i8] + %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1 + store i8 0, i8* %1 + store i8 1, i8* %2 + %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index + %4 = load i8, i8* %3 + %5 = sext i8 %4 to i32 + store i32 %5, i32 addrspace(1)* %out + ret void + +} + +; Make sure we don't overwrite workitem information with private memory + +; FUNC-LABEL: {{^}}work_item_info: +; R600-NOT: MOV T0.X +; Additional check in case the move ends up in the last slot +; R600-NOT: MOV * TO.X + +; SI-NOT: v_mov_b32_e{{(32|64)}} v0 +define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = alloca [2 x i32] + %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 + store i32 0, i32* %1 + store i32 1, i32* %2 + %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in + %4 = load i32, i32* %3 + %5 = call i32 @llvm.r600.read.tidig.x() + %6 = add i32 %4, %5 + store i32 %6, i32 addrspace(1)* 
%out + ret void +} + +; Test that two stack objects are not stored in the same register +; The second stack object should be in T3.X +; FUNC-LABEL: {{^}}no_overlap: +; R600_CHECK: MOV +; R600_CHECK: [[CHAN:[XYZW]]]+ +; R600-NOT: [[CHAN]]+ +; SI: v_mov_b32_e32 v3 +define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = alloca [3 x i8], align 1 + %1 = alloca [2 x i8], align 1 + %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 + %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 + %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 + %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 + %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 + store i8 0, i8* %2 + store i8 1, i8* %3 + store i8 2, i8* %4 + store i8 1, i8* %5 + store i8 0, i8* %6 + %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in + %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in + %9 = load i8, i8* %7 + %10 = load i8, i8* %8 + %11 = add i8 %9, %10 + %12 = sext i8 %11 to i32 + store i32 %12, i32 addrspace(1)* %out + ret void +} + +define void @char_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i8]] + %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 + store i8 0, i8* %gep0 + store i8 1, i8* %gep1 + %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index + %load = load i8, i8* %gep2 + %sext = sext i8 %load to i32 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i32]] + %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i64]] + %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 + store i64 0, i64* %gep0 + store i64 1, i64* %gep1 + %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index + %load = load i64, i64* %gep2 + store i64 %load, i64 addrspace(1)* %out + ret void +} + +%struct.pair32 = type { i32, i32 } + +define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x %struct.pair32]] + %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x %struct.pair32] + %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x 
%struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %cmp = icmp eq i32 %in, 0 + %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 + %load = load i32, i32* %sel + store i32 %load, i32 addrspace(1)* %out + ret void +} + +; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it +; finds one, it should stop trying to promote. + +; FUNC-LABEL: ptrtoint: +; SI-NOT: ds_write +; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 +define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %alloca = alloca [16 x i32] + %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a + store i32 5, i32* %tmp0 + %tmp1 = ptrtoint [16 x i32]* %alloca to i32 + %tmp2 = add i32 %tmp1, 5 + %tmp3 = inttoptr i32 %tmp2 to i32* + %tmp4 = getelementptr i32, i32* %tmp3, i32 %b + %tmp5 = load i32, i32* %tmp4 + store i32 %tmp5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/pv-packing.ll b/test/CodeGen/AMDGPU/pv-packing.ll new file mode 100644 index 00000000000..abeae563ff3 --- /dev/null +++ b/test/CodeGen/AMDGPU/pv-packing.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +;CHECK: DOT4 T{{[0-9]\.X}} +;CHECK: MULADD_IEEE * T{{[0-9]\.W}} + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg2, i32 0 + %4 = extractelement <4 x float> %reg2, i32 1 + %5 = extractelement <4 x float> %reg2, i32 2 + %6 = extractelement <4 x float> %reg3, i32 0 + %7 = extractelement <4 x float> %reg3, i32 1 + %8 = extractelement <4 x float> %reg3, i32 2 + %9 = load <4 x float>, <4 x float> addrspace(8)* null + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9) + %12 = fmul float %0, %3 + %13 = fadd float %12, %6 + %14 = fmul float %1, %4 + %15 = fadd float %14, %7 + %16 = fmul float %2, %5 + %17 = fadd float %16, %8 + %18 = fmul float %11, %11 + %19 = fadd float %18, %0 + %20 = insertelement <4 x float> undef, float %13, i32 0 + %21 = insertelement <4 x float> %20, float %15, i32 1 + %22 = insertelement <4 x float> %21, float %17, i32 2 + %23 = insertelement <4 x float> %22, float %19, i32 3 + %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10) + %25 = insertelement <4 x float> undef, float %24, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { 
"ShaderType"="1" } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll new file mode 100644 index 00000000000..9a57dd19765 --- /dev/null +++ b/test/CodeGen/AMDGPU/pv.ll @@ -0,0 +1,241 @@ +; RUN: llc < %s -march=r600 | FileCheck %s + +; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) +; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = extractelement <4 x float> %reg2, i32 1 + %6 = extractelement <4 x float> %reg2, i32 2 + %7 = extractelement <4 x float> %reg2, i32 3 + %8 = extractelement <4 x float> %reg3, i32 0 + %9 = extractelement <4 x float> %reg3, i32 1 + %10 = extractelement <4 x float> %reg3, i32 2 + %11 = extractelement <4 x float> %reg3, i32 3 + %12 = extractelement <4 x float> %reg4, i32 0 + %13 = extractelement <4 x float> %reg4, i32 1 + %14 = extractelement <4 x float> %reg4, i32 2 + %15 = extractelement <4 x float> %reg4, i32 3 + %16 = extractelement <4 x float> %reg5, i32 0 + %17 = extractelement <4 x float> %reg5, i32 1 + %18 = extractelement <4 x float> %reg5, i32 2 + %19 = extractelement <4 x float> %reg5, i32 3 + %20 = extractelement <4 x float> %reg6, i32 0 + %21 = extractelement <4 x float> %reg6, i32 1 + %22 = extractelement <4 x float> %reg6, i32 2 + %23 = extractelement <4 x float> %reg6, i32 3 + %24 = extractelement <4 x float> %reg7, i32 0 + %25 = extractelement <4 x float> %reg7, i32 1 + %26 = extractelement <4 x float> %reg7, i32 2 + %27 = extractelement <4 x float> %reg7, i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* null + %29 = extractelement <4 x float> %28, i32 0 + %30 = fmul float %0, %29 + %31 = load <4 x float>, <4 x float> addrspace(8)* null + %32 = extractelement <4 x float> %31, i32 1 + %33 = fmul float %0, %32 + %34 = load <4 x float>, <4 x float> addrspace(8)* null + %35 = extractelement <4 x float> %34, i32 2 + %36 = fmul float %0, %35 + %37 = load <4 x float>, <4 x float> addrspace(8)* null + %38 = extractelement <4 x float> %37, i32 3 + %39 = fmul float %0, %38 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %41 = extractelement <4 x float> %40, i32 0 + %42 = fmul float %1, %41 + %43 = fadd float %42, %30 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %45 = extractelement <4 x float> %44, i32 1 + %46 = fmul float %1, %45 + %47 = fadd float %46, %33 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %49 = extractelement <4 x float> %48, i32 2 + %50 = fmul float %1, %49 + %51 = fadd float %50, %36 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %53 = extractelement <4 x float> %52, i32 3 + %54 = fmul float %1, %53 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] 
addrspace(8)* null, i64 0, i32 2) + %57 = extractelement <4 x float> %56, i32 0 + %58 = fmul float %2, %57 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %61 = extractelement <4 x float> %60, i32 1 + %62 = fmul float %2, %61 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %65 = extractelement <4 x float> %64, i32 2 + %66 = fmul float %2, %65 + %67 = fadd float %66, %51 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %69 = extractelement <4 x float> %68, i32 3 + %70 = fmul float %2, %69 + %71 = fadd float %70, %55 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %73 = extractelement <4 x float> %72, i32 0 + %74 = fmul float %3, %73 + %75 = fadd float %74, %59 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %77 = extractelement <4 x float> %76, i32 1 + %78 = fmul float %3, %77 + %79 = fadd float %78, %63 + %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %81 = extractelement <4 x float> %80, i32 2 + %82 = fmul float %3, %81 + %83 = fadd float %82, %67 + %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %85 = extractelement <4 x float> %84, i32 3 + %86 = fmul float %3, %85 + %87 = fadd float %86, %71 + %88 = insertelement <4 x float> undef, float %4, i32 0 + %89 = insertelement <4 x float> %88, float %5, i32 1 + %90 = insertelement <4 x float> %89, float %6, i32 2 + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 + %92 = insertelement <4 x float> undef, float %4, i32 0 + %93 = insertelement <4 x float> %92, float %5, i32 1 + %94 = insertelement <4 x float> %93, float %6, i32 2 + %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 + %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) + %97 = call float @fabs(float %96) + %98 = call float @llvm.AMDGPU.rsq.f32(float %97) + %99 = fmul float %4, %98 + %100 = fmul float %5, %98 + %101 = fmul float %6, %98 + %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %103 = extractelement <4 x float> %102, i32 0 + %104 = fmul float %103, %8 + %105 = fadd float %104, %20 + %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %107 = extractelement <4 x float> %106, i32 1 + %108 = fmul float %107, %9 + %109 = fadd float %108, %21 + %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %111 = extractelement <4 x float> %110, i32 2 + %112 = fmul float %111, %10 + %113 = fadd float %112, %22 + %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) + %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) + %116 = call float @llvm.AMDIL.clamp.(float %113, float 
0.000000e+00, float 1.000000e+00) + %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %119 = extractelement <4 x float> %118, i32 0 + %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %121 = extractelement <4 x float> %120, i32 1 + %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %123 = extractelement <4 x float> %122, i32 2 + %124 = insertelement <4 x float> undef, float %99, i32 0 + %125 = insertelement <4 x float> %124, float %100, i32 1 + %126 = insertelement <4 x float> %125, float %101, i32 2 + %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 + %128 = insertelement <4 x float> undef, float %119, i32 0 + %129 = insertelement <4 x float> %128, float %121, i32 1 + %130 = insertelement <4 x float> %129, float %123, i32 2 + %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 + %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131) + %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %134 = extractelement <4 x float> %133, i32 0 + %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %136 = extractelement <4 x float> %135, i32 1 + %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %138 = extractelement <4 x float> %137, i32 2 + %139 = insertelement <4 x float> undef, float %99, i32 0 + %140 = insertelement <4 x float> %139, float %100, i32 1 + %141 = insertelement <4 x float> %140, float %101, i32 2 + %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 + %143 = insertelement <4 x float> undef, float %134, i32 0 + %144 = insertelement <4 x float> %143, float %136, i32 1 + %145 = insertelement <4 x float> %144, float %138, i32 2 + %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 + %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146) + %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %149 = extractelement <4 x float> %148, i32 0 + %150 = fmul float %149, %8 + %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %152 = extractelement <4 x float> %151, i32 1 + %153 = fmul float %152, %9 + %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %155 = extractelement <4 x float> %154, i32 2 + %156 = fmul float %155, %10 + %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %158 = extractelement <4 x float> %157, i32 0 + %159 = fmul float %158, %12 + %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %161 = extractelement <4 x float> %160, i32 1 + %162 = fmul float %161, %13 + %163 = load <4 x float>, <4 x 
float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %164 = extractelement <4 x float> %163, i32 2 + %165 = fmul float %164, %14 + %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %167 = extractelement <4 x float> %166, i32 0 + %168 = fmul float %167, %16 + %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %170 = extractelement <4 x float> %169, i32 1 + %171 = fmul float %170, %17 + %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %173 = extractelement <4 x float> %172, i32 2 + %174 = fmul float %173, %18 + %175 = fcmp uge float %132, 0.000000e+00 + %176 = select i1 %175, float %132, float 0.000000e+00 + %177 = fcmp uge float %147, 0.000000e+00 + %178 = select i1 %177, float %147, float 0.000000e+00 + %179 = call float @llvm.pow.f32(float %178, float %24) + %180 = fcmp ult float %132, 0.000000e+00 + %181 = select i1 %180, float 0.000000e+00, float %179 + %182 = fadd float %150, %105 + %183 = fadd float %153, %109 + %184 = fadd float %156, %113 + %185 = fmul float %176, %159 + %186 = fadd float %185, %182 + %187 = fmul float %176, %162 + %188 = fadd float %187, %183 + %189 = fmul float %176, %165 + %190 = fadd float %189, %184 + %191 = fmul float %181, %168 + %192 = fadd float %191, %186 + %193 = fmul float %181, %171 + %194 = fadd float %193, %188 + %195 = fmul float %181, %174 + %196 = fadd float %195, %190 + %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) + %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) + %200 = insertelement <4 x float> undef, float %75, i32 0 + %201 = insertelement <4 x float> %200, float %79, i32 1 + %202 = insertelement <4 x float> %201, float %83, i32 2 + %203 = insertelement <4 x float> %202, float %87, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1) + %204 = insertelement <4 x float> undef, float %197, i32 0 + %205 = insertelement <4 x float> %204, float %198, i32 1 + %206 = insertelement <4 x float> %205, float %199, i32 2 + %207 = insertelement <4 x float> %206, float %117, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #3 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } +attributes #2 = { readonly } +attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/r600-encoding.ll b/test/CodeGen/AMDGPU/r600-encoding.ll new file mode 100644 index 00000000000..3a82ee30a32 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-encoding.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | 
FileCheck --check-prefix=R600 %s + +; The earliest R600 GPUs have a slightly different encoding than the rest of +; the VLIW4/5 GPUs. + +; EG: {{^}}test: +; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] + +; R600: {{^}}test: +; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] + +define void @test(<4 x float> inreg %reg0) #0 { +entry: + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fmul float %r0, %r1 + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/r600-export-fix.ll b/test/CodeGen/AMDGPU/r600-export-fix.ll new file mode 100644 index 00000000000..7cb80195b36 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-export-fix.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s + +;CHECK: EXPORT T{{[0-9]}}.XYZW +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0XYZ +;CHECK: EXPORT T{{[0-9]}}.XYZW +;CHECK: EXPORT T{{[0-9]}}.YZ00 +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0000 + + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fmul float %5, %0 + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %8 = extractelement <4 x float> %7, i32 1 + %9 = fmul float %8, %0 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %11 = extractelement <4 x float> %10, i32 2 + %12 = fmul float %11, %0 + %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %14 = extractelement <4 x float> %13, i32 3 + %15 = fmul float %14, %0 + %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %17 = extractelement <4 x float> %16, i32 0 + %18 = fmul float %17, %1 + %19 = fadd float %18, %6 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %21 = extractelement <4 x float> %20, i32 1 + %22 = fmul float %21, %1 + %23 = fadd float %22, %9 + %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %25 = extractelement <4 x float> %24, i32 2 + %26 = fmul float %25, %1 + %27 = fadd float %26, %12 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fmul float %29, %1 + %31 = fadd float %30, %15 + %32 = load 
<4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %33 = extractelement <4 x float> %32, i32 0 + %34 = fmul float %33, %2 + %35 = fadd float %34, %19 + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %37 = extractelement <4 x float> %36, i32 1 + %38 = fmul float %37, %2 + %39 = fadd float %38, %23 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %41 = extractelement <4 x float> %40, i32 2 + %42 = fmul float %41, %2 + %43 = fadd float %42, %27 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %45 = extractelement <4 x float> %44, i32 3 + %46 = fmul float %45, %2 + %47 = fadd float %46, %31 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %49 = extractelement <4 x float> %48, i32 0 + %50 = fmul float %49, %3 + %51 = fadd float %50, %35 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %53 = extractelement <4 x float> %52, i32 1 + %54 = fmul float %53, %3 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %57 = extractelement <4 x float> %56, i32 2 + %58 = fmul float %57, %3 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %61 = extractelement <4 x float> %60, i32 3 + %62 = fmul float %61, %3 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* null + %65 = extractelement <4 x float> %64, i32 0 + %66 = load <4 x float>, <4 x float> addrspace(8)* null + %67 = extractelement <4 x float> %66, i32 1 + %68 = load <4 x float>, <4 x float> addrspace(8)* null + %69 = extractelement <4 x float> %68, i32 2 + %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %71 = extractelement <4 x float> %70, i32 0 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %73 = extractelement <4 x float> %72, i32 1 + %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %75 = extractelement <4 x float> %74, i32 2 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %77 = extractelement <4 x float> %76, i32 0 + %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %79 = extractelement <4 x float> %78, i32 1 + %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %81 = extractelement <4 x float> %80, i32 2 + %82 = insertelement <4 x float> undef, float %51, i32 0 + %83 = insertelement <4 x float> %82, float %55, i32 1 + %84 = insertelement <4 x float> %83, float 
%59, i32 2 + %85 = insertelement <4 x float> %84, float %63, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1) + %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1 + %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2 + %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2) + %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1 + %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2 + %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2) + %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %95 = insertelement <4 x float> %94, float %65, i32 1 + %96 = insertelement <4 x float> %95, float %67, i32 2 + %97 = insertelement <4 x float> %96, float %69, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2) + %98 = insertelement <4 x float> undef, float %77, i32 0 + %99 = insertelement <4 x float> %98, float %79, i32 1 + %100 = insertelement <4 x float> %99, float %81, i32 2 + %101 = insertelement <4 x float> %100, float %71, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2) + %102 = insertelement <4 x float> undef, float %73, i32 0 + %103 = insertelement <4 x float> %102, float %75, i32 1 + %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2 + %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2) + %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1 + %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2 + %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2) + %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1 + %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2 + %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll new file mode 100644 index 00000000000..f388f8ffe29 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll @@ -0,0 +1,58 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +define void @main(<4 x float> inreg, <4 x float> inreg) #0 { +main_body: + %2 = extractelement <4 x float> %0, i32 0 + %3 = extractelement <4 x float> %0, i32 1 + %4 = extractelement <4 x float> %0, i32 2 + %5 = extractelement <4 x float> %0, i32 3 + %6 = insertelement <4 x float> undef, float %2, i32 0 + %7 = insertelement <4 x float> %6, float %3, i32 1 + %8 = insertelement <4 x float> %7, float %4, i32 2 + %9 = insertelement <4 x float> %8, float %5, i32 3 + %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9) + %11 = extractelement <4 x float> %10, i32 0 + %12 = extractelement <4 x float> %10, i32 1 + %13 = extractelement <4 x float> %10, i32 2 + %14 = 
extractelement <4 x float> %10, i32 3 + %15 = call float @fabs(float %13) + %16 = fdiv float 1.000000e+00, %15 + %17 = fmul float %11, %16 + %18 = fadd float %17, 1.500000e+00 + %19 = fmul float %12, %16 + %20 = fadd float %19, 1.500000e+00 + %21 = insertelement <4 x float> undef, float %20, i32 0 + %22 = insertelement <4 x float> %21, float %18, i32 1 + %23 = insertelement <4 x float> %22, float %14, i32 2 + %24 = insertelement <4 x float> %23, float %5, i32 3 + %25 = extractelement <4 x float> %24, i32 0 + %26 = extractelement <4 x float> %24, i32 1 + %27 = extractelement <4 x float> %24, i32 2 + %28 = extractelement <4 x float> %24, i32 3 + %29 = insertelement <4 x float> undef, float %25, i32 0 + %30 = insertelement <4 x float> %29, float %26, i32 1 + %31 = insertelement <4 x float> %30, float %27, i32 2 + %32 = insertelement <4 x float> %31, float %28, i32 3 + %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13) + %34 = extractelement <4 x float> %33, i32 0 + %35 = insertelement <4 x float> undef, float %34, i32 0 + %36 = insertelement <4 x float> %35, float %34, i32 1 + %37 = insertelement <4 x float> %36, float %34, i32 2 + %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 + +; Function Attrs: readnone +declare float @fabs(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/r600cfg.ll b/test/CodeGen/AMDGPU/r600cfg.ll new file mode 100644 index 00000000000..c7b9d65220f --- /dev/null +++ b/test/CodeGen/AMDGPU/r600cfg.ll @@ -0,0 +1,119 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = bitcast float %0 to i32 + %5 = icmp eq i32 %4, 0 + %6 = sext i1 %5 to i32 + %7 = bitcast i32 %6 to float + %8 = bitcast float %7 to i32 + %9 = icmp ne i32 %8, 0 + %. 
= select i1 %9, float 0x36A0000000000000, float %0 + br label %LOOP + +LOOP: ; preds = %LOOP47, %main_body + %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] + %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] + %10 = bitcast float %temp4.1 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF41, label %ENDIF40 + +IF41: ; preds = %LOOP + %16 = insertelement <4 x float> undef, float %0, i32 0 + %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 + %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) + %20 = insertelement <4 x float> undef, float %0, i32 0 + %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 + %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 + %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) + %24 = insertelement <4 x float> undef, float %0, i32 0 + %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 + %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 + %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) + %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 + %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 + %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) + %32 = insertelement <4 x float> undef, float %0, i32 0 + %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 + %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 + %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) + ret void + +ENDIF40: ; preds = %LOOP + %36 = bitcast float %temp8.0 to i32 + %37 = add i32 %36, 1 + %38 = bitcast i32 %37 to float + %39 = bitcast float %temp4.1 to i32 + %40 = urem i32 %39, 2 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp eq i32 %42, 0 + %44 = sext i1 %43 to i32 + %45 = bitcast i32 %44 to float + %46 = bitcast float %45 to i32 + %47 = icmp ne i32 %46, 0 + %48 = bitcast float %temp4.1 to i32 + br i1 %47, label %IF44, label %ELSE45 + +IF44: ; preds = %ENDIF40 + %49 = udiv i32 %48, 2 + br label %ENDIF43 + +ELSE45: ; preds = %ENDIF40 + %50 = mul i32 3, %48 + %51 = add i32 %50, 1 + br label %ENDIF43 + +ENDIF43: ; preds = %ELSE45, %IF44 + %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] + %52 = bitcast i32 %.sink to float + %53 = load <4 x float>, <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = bitcast float %54 to i32 + br label %LOOP47 + +LOOP47: ; preds = %ENDIF48, %ENDIF43 + %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] + %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] + %56 = bitcast float %temp28.0 to i32 + %57 = icmp uge i32 %56, %55 + %58 = sext i1 %57 to i32 + %59 = bitcast i32 %58 to float + %60 = bitcast float %59 to i32 + %61 = icmp ne i32 %60, 0 + br i1 %61, label %LOOP, label %ENDIF48 + +ENDIF48: ; preds 
= %LOOP47 + %62 = bitcast float %temp12.1 to i32 + %63 = mul i32 %62, 2 + %64 = bitcast i32 %63 to float + %65 = bitcast float %64 to i32 + %66 = urem i32 %65, 2147483647 + %67 = bitcast i32 %66 to float + %68 = bitcast float %temp28.0 to i32 + %69 = add i32 %68, 1 + %70 = bitcast i32 %69 to float + br label %LOOP47 +} + +declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll new file mode 100644 index 00000000000..b4ac47afced --- /dev/null +++ b/test/CodeGen/AMDGPU/reciprocal.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = fdiv float 1.0, %r0 + %vec = insertelement <4 x float> undef, float %r1, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll new file mode 100644 index 00000000000..de6bfb31088 --- /dev/null +++ b/test/CodeGen/AMDGPU/register-count-comments.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.SI.tid() nounwind readnone + +; SI-LABEL: {{^}}foo: +; SI: .section .AMDGPU.csdata +; SI: ; Kernel info: +; SI: ; NumSgprs: {{[0-9]+}} +; SI: ; NumVgprs: {{[0-9]+}} +define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { + %tid = call i32 @llvm.SI.tid() nounwind readnone + %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid + %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid + %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %outptr, align 4 + ret void +} + +; SI-LABEL: {{^}}one_vgpr_used: +; SI: NumVgprs: 1 +define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { + store i32 %x, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll new file mode 100644 index 00000000000..187650ff9a5 --- /dev/null +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 + %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 + store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16 + ret 
void +} + +; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: s_endpgm +define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { + %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 + %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 + store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 + ret void +} + +; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: s_endpgm +define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 + %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 + store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 + store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32 + ret void +} + +; SI-LABEL: {{^}}no_reorder_extload_64: +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_write_b64 +; SI-NOT: ds_read +; SI: ds_write_b64 +; SI: s_endpgm +define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { + %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 + %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 + %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> + %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64> + %tmp7 = add <2 x i64> %tmp1ext, + %tmp9 = add <2 x i64> %tmp4ext, + %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32> + %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32> + store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8 + store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll new file mode 100644 index 00000000000..3f4ceb7e031 --- /dev/null +++ b/test/CodeGen/AMDGPU/rotl.i64.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}s_rotl_i64: +; BOTH-DAG: s_lshl_b64 +; BOTH-DAG: s_sub_i32 +; BOTH-DAG: s_lshr_b64 +; BOTH: s_or_b64 +; BOTH: s_endpgm +define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +entry: + %0 = shl i64 %x, %y + %1 = sub i64 64, %y + %2 = lshr i64 %x, %1 + %3 = or i64 %0, %2 + store i64 %3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotl_i64: 
+; SI-DAG: v_lshl_b64 +; VI-DAG: v_lshlrev_b64 +; BOTH-DAG: v_sub_i32 +; SI: v_lshr_b64 +; VI: v_lshrrev_b64 +; BOTH: v_or_b32 +; BOTH: v_or_b32 +; BOTH: s_endpgm +define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +entry: + %x = load i64, i64 addrspace(1)* %xptr, align 8 + %y = load i64, i64 addrspace(1)* %yptr, align 8 + %tmp0 = shl i64 %x, %y + %tmp1 = sub i64 64, %y + %tmp2 = lshr i64 %x, %tmp1 + %tmp3 = or i64 %tmp0, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll new file mode 100644 index 00000000000..6c144cd56ea --- /dev/null +++ b/test/CodeGen/AMDGPU/rotl.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rotl_i32: +; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x +; R600-NEXT: 32 +; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}} + +; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} +; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] +; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] +define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +entry: + %0 = shl i32 %x, %y + %1 = sub i32 32, %y + %2 = lshr i32 %x, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotl_v2i32: +; SI-DAG: s_sub_i32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI: s_endpgm +define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +entry: + %0 = shl <2 x i32> %x, %y + %1 = sub <2 x i32> , %y + %2 = lshr <2 x i32> %x, %1 + %3 = or <2 x i32> %0, %2 + store <2 x i32> %3, <2 x i32> addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotl_v4i32: +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI: s_endpgm +define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +entry: + %0 = shl <4 x i32> %x, %y + %1 = sub <4 x i32> , %y + %2 = lshr <4 x i32> %x, %1 + %3 = or <4 x i32> %0, %2 + store <4 x i32> %3, <4 x i32> addrspace(1)* %in + ret void +} diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll new file mode 100644 index 00000000000..586de44a566 --- /dev/null +++ b/test/CodeGen/AMDGPU/rotr.i64.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}s_rotr_i64: +; BOTH-DAG: s_sub_i32 +; BOTH-DAG: s_lshr_b64 +; BOTH-DAG: s_lshl_b64 +; BOTH: s_or_b64 +define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +entry: + %tmp0 = sub i64 64, %y + %tmp1 = shl i64 %x, %tmp0 + %tmp2 = lshr i64 %x, %y + %tmp3 = or i64 %tmp1, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotr_i64: +; BOTH-DAG: v_sub_i32 +; SI-DAG: v_lshr_b64 +; SI-DAG: v_lshl_b64 +; VI-DAG: v_lshrrev_b64 +; VI-DAG: v_lshlrev_b64 +; BOTH: v_or_b32 +; BOTH: v_or_b32 +define void @v_rotr_i64(i64 addrspace(1)* %in, 
i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +entry: + %x = load i64, i64 addrspace(1)* %xptr, align 8 + %y = load i64, i64 addrspace(1)* %yptr, align 8 + %tmp0 = sub i64 64, %y + %tmp1 = shl i64 %x, %tmp0 + %tmp2 = lshr i64 %x, %y + %tmp3 = or i64 %tmp1, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}s_rotr_v2i64: +define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { +entry: + %tmp0 = sub <2 x i64> , %y + %tmp1 = shl <2 x i64> %x, %tmp0 + %tmp2 = lshr <2 x i64> %x, %y + %tmp3 = or <2 x i64> %tmp1, %tmp2 + store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotr_v2i64: +define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { +entry: + %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 + %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 + %tmp0 = sub <2 x i64> , %y + %tmp1 = shl <2 x i64> %x, %tmp0 + %tmp2 = lshr <2 x i64> %x, %y + %tmp3 = or <2 x i64> %tmp1, %tmp2 + store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in + ret void +} diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll new file mode 100644 index 00000000000..044f9ffe6d6 --- /dev/null +++ b/test/CodeGen/AMDGPU/rotr.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rotr_i32: +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +entry: + %tmp0 = sub i32 32, %y + %tmp1 = shl i32 %x, %tmp0 + %tmp2 = lshr i32 %x, %y + %tmp3 = or i32 %tmp1, %tmp2 + store i32 %tmp3, i32 addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotr_v2i32: +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +entry: + %tmp0 = sub <2 x i32> , %y + %tmp1 = shl <2 x i32> %x, %tmp0 + %tmp2 = lshr <2 x i32> %x, %y + %tmp3 = or <2 x i32> %tmp1, %tmp2 + store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotr_v4i32: +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +entry: + %tmp0 = sub <4 x i32> , %y + %tmp1 = shl <4 x i32> %x, %tmp0 + %tmp2 = lshr <4 x i32> %x, %y + %tmp3 = or <4 x i32> %tmp1, %tmp2 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in + ret void +} diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll new file mode 100644 index 00000000000..b67b800c737 --- /dev/null +++ b/test/CodeGen/AMDGPU/rsq.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.sqrt.f32(float) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; 
SI-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 +; SI: s_endpgm +define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float, float addrspace(1)* %in, align 4 + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}rsq_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64_e32 +; SI: s_endpgm +define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { + %val = load double, double addrspace(1)* %in, align 4 + %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone + %div = fdiv double 1.0, %sqrt + store double %div, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}rsq_f32_sgpr: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; SI: s_endpgm +define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; Recognize that this is rsqrt(a) * rcp(b) * c, +; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. + +; SI-LABEL: @rsqrt_fmul +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 + +; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] +; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] +; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] +; SI-UNSAFE: buffer_store_dword [[RESULT]] + +; SI-SAFE-NOT: v_rsq_f32 + +; SI: s_endpgm +define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %x = call float @llvm.sqrt.f32(float %a) + %y = fmul float %x, %b + %z = fdiv float %c, %y + store float %z, float addrspace(1)* %out.gep + ret void +} diff --git a/test/CodeGen/AMDGPU/rv7x0_count3.ll b/test/CodeGen/AMDGPU/rv7x0_count3.ll new file mode 100644 index 00000000000..c3fd923e459 --- /dev/null +++ b/test/CodeGen/AMDGPU/rv7x0_count3.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s + +; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] + +define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { + %1 = extractelement <4 x float> %reg1, i32 0 + %2 = extractelement <4 x float> %reg1, i32 1 + %3 = extractelement <4 x float> %reg1, i32 2 + %4 = extractelement <4 x float> %reg1, i32 3 + %5 = insertelement <4 x float> undef, float %1, i32 0 + %6 = insertelement <4 x float> %5, float %2, i32 1 + %7 = insertelement <4 x float> %6, float %3, i32 2 + %8 = insertelement <4 x float> %7, float %4, i32 3 + %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + 
%10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1) + %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1) + %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1) + %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1) + %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1) + %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1) + %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1) + %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1) + %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1) + %19 = fadd <4 x float> %9, %10 + %20 = fadd <4 x float> %19, %11 + %21 = fadd <4 x float> %20, %12 + %22 = fadd <4 x float> %21, %13 + %23 = fadd <4 x float> %22, %14 + %24 = fadd <4 x float> %23, %15 + %25 = fadd <4 x float> %24, %16 + %26 = fadd <4 x float> %25, %17 + %27 = fadd <4 x float> %26, %18 + call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2) + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll new file mode 100644 index 00000000000..6b1a36c979c --- /dev/null +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -0,0 +1,185 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_movk_i32_k0: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k1: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k2: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) + store i64 %or, i64 
addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k3: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k4: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k5: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k6: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k7: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} +; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + + +; SI-LABEL: {{^}}s_movk_i32_k8: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 
[[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k9: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k10: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k11: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k12: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 + store i64 %or, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll new file mode 100644 index 00000000000..f8ced7942a6 --- /dev/null +++ b/test/CodeGen/AMDGPU/saddo.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: 
{{^}}saddo_i64_zext: +define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_saddo_i32: +define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_saddo_i32: +define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_saddo_i64: +define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_saddo_i64: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll new file mode 100644 index 00000000000..0b964957654 --- /dev/null +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -0,0 +1,118 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; In this test both the pointer and the offset operands to the +; BUFFER_LOAD instructions end up being stored in vgprs. This +; requires us to add the pointer and offset together, store the +; result in the offset operand (vaddr), and then store 0 in an +; sgpr register pair and use that for the pointer operand +; (low 64-bits of srsrc). 
+
+; CHECK-LABEL: {{^}}mubuf:
+
+; Make sure we aren't using VGPRs for the source operand of s_mov_b64
+; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v
+
+; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
+; instructions
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #1
+ %1 = call i32 @llvm.r600.read.tidig.y() #1
+ %2 = sext i32 %0 to i64
+ %3 = sext i32 %1 to i64
+ br label %loop
+
+loop:
+ %4 = phi i64 [0, %entry], [%5, %loop]
+ %5 = add i64 %2, %4
+ %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5
+ %7 = load i8, i8 addrspace(1)* %6, align 1
+ %8 = or i64 %5, 1
+ %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8
+ %10 = load i8, i8 addrspace(1)* %9, align 1
+ %11 = add i8 %7, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ %13 = icmp slt i64 %5, 10
+ br i1 %13, label %loop, label %done
+
+done:
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.r600.read.tidig.y() #1
+
+attributes #1 = { nounwind readnone }
+
+; Test moving an SMRD instruction to the VALU
+
+; CHECK-LABEL: {{^}}smrd_valu:
+; CHECK: buffer_load_dword [[OUT:v[0-9]+]]
+; CHECK: buffer_store_dword [[OUT]]
+
+define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) {
+entry:
+ %0 = icmp ne i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+ %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2
+ br label %endif
+
+endif:
+ %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else]
+ %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000
+ %6 = load i32, i32 addrspace(2)* %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test moving an SMRD with an immediate offset to the VALU
+
+; CHECK-LABEL: {{^}}smrd_valu2:
+; CHECK: buffer_load_dword
+define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %1 = add i32 %0, 4
+ %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4
+ %3 = load i32, i32 addrspace(2)* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v8i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+ %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
+ %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
+ %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
+ store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v16i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+ %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
+ %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
+ %tmp3 =
load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll new file mode 100644 index 00000000000..0970e5d3063 --- /dev/null +++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: s_endpgm +define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tmp1 = load i32, i32 addrspace(1)* %in, align 4 + %bc = bitcast i32 %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> + store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: s_endpgm +define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tmp1 = load float, float addrspace(1)* %in, align 4 + %bc = bitcast float %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> + store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed +; to produce one, but for some reason never made it to selection. 
+ + +; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 +; %bc = bitcast i32 %tmp1 to <4 x i8> + +; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> +; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4 +; ret void +; } + +; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 +; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 +; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> +; %add = add <4 x i32> %bc, +; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> +; %add = add <8 x i16> %bc, +; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> +; %add = add <4 x i16> %bc, +; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> +; %add = add <4 x i16> %bc, +; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 +; ret void +; } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll new file mode 100644 index 00000000000..11e8f5176f4 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll @@ -0,0 +1,82 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = fcmp ult float %1, 0.000000e+00 + %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 + %6 = fsub float -0.000000e+00, %5 + %7 = fptosi float %6 to i32 + %8 = bitcast i32 %7 to float + %9 = fcmp ult float %0, 5.700000e+01 + %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 + %11 = fsub float -0.000000e+00, %10 + %12 = fptosi float %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %8 to i32 + %15 = bitcast float %13 to i32 + %16 = and i32 %14, %15 + %17 = bitcast i32 %16 to float + %18 = bitcast float %17 to i32 + %19 = icmp ne i32 %18, 0 + %20 = fcmp ult float %0, 0.000000e+00 + %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 + %22 = fsub float -0.000000e+00, %21 + %23 = fptosi float %22 to i32 + %24 = bitcast i32 %23 to float + %25 = bitcast float %24 to i32 + %26 = icmp ne i32 %25, 0 + br i1 %19, label %IF, label %ELSE + +IF: ; preds = %main_body + %. 
= select i1 %26, float 0.000000e+00, float 1.000000e+00 + %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 + br label %ENDIF + +ELSE: ; preds = %main_body + br i1 %26, label %ENDIF, label %ELSE17 + +ENDIF: ; preds = %ELSE17, %ELSE, %IF + %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] + %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00) + %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00) + %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00) + %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %31 = insertelement <4 x float> undef, float %27, i32 0 + %32 = insertelement <4 x float> %31, float %28, i32 1 + %33 = insertelement <4 x float> %32, float %29, i32 2 + %34 = insertelement <4 x float> %33, float %30, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0) + ret void + +ELSE17: ; preds = %ELSE + %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %38 = fadd float %35, 0x3FC99999A0000000 + %39 = fadd float %36, 0x3FC99999A0000000 + %40 = fadd float %37, 0x3FC99999A0000000 + %41 = fadd float %38, 0x3FC99999A0000000 + %42 = fadd float %39, 0x3FC99999A0000000 + %43 = fadd float %40, 0x3FC99999A0000000 + %44 = fadd float %41, 0x3FC99999A0000000 + %45 = fadd float %42, 0x3FC99999A0000000 + %46 = fadd float %43, 0x3FC99999A0000000 + %47 = fadd float %44, 0x3FC99999A0000000 + %48 = fadd float %45, 0x3FC99999A0000000 + %49 = fadd float %46, 0x3FC99999A0000000 + br label %ENDIF +} + +declare float @llvm.AMDIL.clamp.(float, float, float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readnone } +attributes #1 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll new file mode 100644 index 00000000000..759197ca61f --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll @@ -0,0 +1,88 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 3 + %2 = fptosi float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = bitcast float %3 to i32 + %5 = sdiv i32 %4, 4 + %6 = bitcast i32 %5 to float + %7 = bitcast float %6 to i32 + %8 = mul i32 %7, 4 + %9 = bitcast i32 %8 to float + %10 = bitcast float %9 to i32 + %11 = sub i32 0, %10 + %12 = bitcast i32 %11 to float + %13 = bitcast float %3 to i32 + %14 = bitcast float %12 to i32 + %15 = add i32 %13, %14 + %16 = bitcast i32 %15 to float + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = load <4 x float>, <4 x float> addrspace(9)* null + %20 = extractelement <4 x float> %19, i32 1 + %21 = load <4 x float>, <4 x float> addrspace(9)* null + %22 = extractelement <4 x float> %21, i32 2 + br label %LOOP + +LOOP: ; preds = %IF31, %main_body + %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] + %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] + %temp5.0 = phi float [ %20, 
%main_body ], [ %temp5.1, %IF31 ] + %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] + %23 = bitcast float %temp12.0 to i32 + %24 = bitcast float %6 to i32 + %25 = icmp sge i32 %23, %24 + %26 = sext i1 %25 to i32 + %27 = bitcast i32 %26 to float + %28 = bitcast float %27 to i32 + %29 = icmp ne i32 %28, 0 + br i1 %29, label %IF, label %LOOP29 + +IF: ; preds = %LOOP + %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) + %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) + %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %34 = insertelement <4 x float> undef, float %30, i32 0 + %35 = insertelement <4 x float> %34, float %31, i32 1 + %36 = insertelement <4 x float> %35, float %32, i32 2 + %37 = insertelement <4 x float> %36, float %33, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0) + ret void + +LOOP29: ; preds = %LOOP, %ENDIF30 + %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] + %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] + %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] + %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] + %38 = bitcast float %temp20.0 to i32 + %39 = bitcast float %16 to i32 + %40 = icmp sge i32 %38, %39 + %41 = sext i1 %40 to i32 + %42 = bitcast i32 %41 to float + %43 = bitcast float %42 to i32 + %44 = icmp ne i32 %43, 0 + br i1 %44, label %IF31, label %ENDIF30 + +IF31: ; preds = %LOOP29 + %45 = bitcast float %temp12.0 to i32 + %46 = add i32 %45, 1 + %47 = bitcast i32 %46 to float + br label %LOOP + +ENDIF30: ; preds = %LOOP29 + %48 = bitcast float %temp20.0 to i32 + %49 = add i32 %48, 1 + %50 = bitcast i32 %49 to float + br label %LOOP29 +} + +declare float @llvm.AMDIL.clamp.(float, float, float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll new file mode 100644 index 00000000000..28cc08abc02 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll @@ -0,0 +1,55 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 3 + %2 = fptosi float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = load <4 x float>, <4 x float> addrspace(9)* null + %5 = extractelement <4 x float> %4, i32 0 + %6 = load <4 x float>, <4 x float> addrspace(9)* null + %7 = extractelement <4 x float> %6, i32 1 + %8 = load <4 x float>, <4 x float> addrspace(9)* null + %9 = extractelement <4 x float> %8, i32 2 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] + %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] + %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] + %10 = bitcast float %temp8.0 to i32 + %11 = bitcast float %3 to i32 + %12 = icmp sge i32 %10, %11 + %13 = sext i1 %12 to i32 + %14 = bitcast i32 %13 to float + %15 = bitcast float %14 to i32 + %16 = icmp ne i32 %15, 0 + br i1 %16, label %IF, label %ENDIF + +IF: ; preds = %LOOP + %17 = call float @llvm.AMDIL.clamp.(float 
%temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %21 = insertelement <4 x float> undef, float %17, i32 0
+ %22 = insertelement <4 x float> %21, float %18, i32 1
+ %23 = insertelement <4 x float> %22, float %19, i32 2
+ %24 = insertelement <4 x float> %23, float %20, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %25 = bitcast float %temp8.0 to i32
+ %26 = add i32 %25, 1
+ %27 = bitcast i32 %26 to float
+ br label %LOOP
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
new file mode 100644
index 00000000000..3f728fd873b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only thing areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
+
+; FUNC-LABEL: {{^}}cluster_global_arg_loads:
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; SI: buffer_store_dword [[REG0]]
+; SI: buffer_store_dword [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+ %load0 = load i32, i32 addrspace(1)* %ptr, align 4
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+ %load1 = load i32, i32 addrspace(1)* %gep, align 4
+ store i32 %load0, i32 addrspace(1)* %out0, align 4
+ store i32 %load1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
+; an MUBUF load which does not have a vaddr operand.
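+
+; An MUBUF access only carries a vaddr operand when it uses offen, idxen or
+; addr64 addressing; an offset-only load is addressed entirely through the
+; SGPR resource descriptor plus an immediate offset, so there is no VGPR
+; address operand at all. The base-pointer comparison therefore has to bail
+; out when that operand is missing, roughly like this (sketch only, not the
+; in-tree code; the usual named-operand helpers are assumed):
+;
+;   int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+;   if (VAddrIdx == -1)
+;     return false; // no VGPR address to compare against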
+; FUNC-LABEL: {{^}}same_base_ptr_crash: +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +entry: + %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset + %tmp0 = load i32, i32 addrspace(1)* %out + %tmp1 = load i32, i32 addrspace(1)* %out1 + %tmp2 = add i32 %tmp0, %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll new file mode 100644 index 00000000000..54946509683 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-if-2.ll @@ -0,0 +1,94 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %1 = extractelement <4 x float> %0, i32 0 + %2 = fadd float 1.000000e+03, %1 + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %4 = extractelement <4 x float> %3, i32 0 + %5 = bitcast float %4 to i32 + %6 = icmp eq i32 %5, 0 + %7 = sext i1 %6 to i32 + %8 = bitcast i32 %7 to float + %9 = bitcast float %8 to i32 + %10 = icmp ne i32 %9, 0 + br i1 %10, label %IF, label %ELSE + +IF: ; preds = %main_body + %11 = call float @fabs(float %2) + %12 = fcmp ueq float %11, 0x7FF0000000000000 + %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00 + %14 = fsub float -0.000000e+00, %13 + %15 = fptosi float %14 to i32 + %16 = bitcast i32 %15 to float + %17 = bitcast float %16 to i32 + %18 = icmp ne i32 %17, 0 + %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00 + %19 = fcmp une float %2, %2 + %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00 + %21 = fsub float -0.000000e+00, %20 + %22 = fptosi float %21 to i32 + %23 = bitcast i32 %22 to float + %24 = bitcast float %23 to i32 + %25 = icmp ne i32 %24, 0 + %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00 + %26 = bitcast float %. 
to i32 + %27 = sitofp i32 %26 to float + %28 = bitcast float %temp8.0 to i32 + %29 = sitofp i32 %28 to float + %30 = fcmp ugt float %2, 0.000000e+00 + %31 = select i1 %30, float 1.000000e+00, float %2 + %32 = fcmp uge float %31, 0.000000e+00 + %33 = select i1 %32, float %31, float -1.000000e+00 + %34 = fadd float %33, 1.000000e+00 + %35 = fmul float %34, 5.000000e-01 + br label %ENDIF + +ELSE: ; preds = %main_body + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %37 = extractelement <4 x float> %36, i32 0 + %38 = bitcast float %37 to i32 + %39 = icmp eq i32 %38, 1 + %40 = sext i1 %39 to i32 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp ne i32 %42, 0 + br i1 %43, label %IF23, label %ENDIF + +ENDIF: ; preds = %IF23, %ELSE, %IF + %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ] + %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ] + %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] + %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] + %44 = insertelement <4 x float> undef, float %temp4.0, i32 0 + %45 = insertelement <4 x float> %44, float %temp5.0, i32 1 + %46 = insertelement <4 x float> %45, float %temp6.0, i32 2 + %47 = insertelement <4 x float> %46, float %temp7.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0) + ret void + +IF23: ; preds = %ELSE + %48 = fcmp ult float 0.000000e+00, %2 + %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00 + %50 = fsub float -0.000000e+00, %49 + %51 = fptosi float %50 to i32 + %52 = bitcast i32 %51 to float + %53 = bitcast float %52 to i32 + %54 = icmp ne i32 %53, 0 + %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00 + %55 = bitcast float %.28 to i32 + %56 = sitofp i32 %55 to float + %57 = load <4 x float>, <4 x float> addrspace(8)* null + %58 = extractelement <4 x float> %57, i32 0 + %59 = fsub float -0.000000e+00, %58 + %60 = fadd float %2, %59 + br label %ENDIF +} + +declare float @fabs(float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readonly } diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll new file mode 100644 index 00000000000..94c653c8f25 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-if.ll @@ -0,0 +1,46 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + %4 = sext i1 %3 to i32 + %5 = bitcast i32 %4 to float + %6 = bitcast float %5 to i32 + %7 = icmp ne i32 %6, 0 + br i1 %7, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %9 = extractelement <4 x float> %8, i32 0 + %10 = bitcast float %9 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF13, label %ENDIF + +ENDIF: ; preds = %IF13, %ELSE, %main_body + %temp.0 = phi float [ 1.000000e+03, %main_body ], [ 
1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %16 = insertelement <4 x float> undef, float %temp.0, i32 0 + %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 + %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2 + %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) + ret void + +IF13: ; preds = %ELSE + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fsub float -0.000000e+00, %21 + %23 = fadd float 1.000000e+03, %22 + br label %ENDIF +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll new file mode 100644 index 00000000000..6b3e0814c38 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s + +; FUNC-LABEL: {{^}}cluster_arg_loads: +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe +; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 +define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { + store i32 %x, i32 addrspace(1)* %out0, align 4 + store i32 %y, i32 addrspace(1)* %out1, align 4 + ret void +} + +; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when +; s_load_dwordx2 has a register offset + +; FUNC-LABEL: @same_base_ptr_crash +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_endpgm +define void @same_base_ptr_crash(i64 addrspace(1)* %out, + i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, + i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, + i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, + i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, + i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, + i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, + i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55, + i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63, + i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71, + i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79, + i64 %arg80, i64 
%arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87, + i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95, + i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103, + i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111, + i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, + i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { +entry: + %value = add i64 %arg125, %arg126 + store i64 %value, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll new file mode 100644 index 00000000000..3863afda5dd --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -0,0 +1,163 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI + +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate + + +; SI-LABEL: {{^}}main( +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 2 + %2 = fcmp ult float %0, 0.000000e+00 + %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00 + %4 = fsub float -0.000000e+00, %3 + %5 = fptosi float %4 to i32 + %6 = bitcast i32 %5 to float + %7 = bitcast float %6 to i32 + %8 = icmp ne i32 %7, 0 + br i1 %8, label %LOOP, label %ENDIF + +Flow1: ; preds = %ENDIF19, %ENDIF16 + %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ] + %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ] + %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ] + %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ] + %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ] + %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ] + br label %Flow + +Flow2: ; preds = %Flow + br label %ENDIF + +ENDIF: ; preds = %main_body, %Flow2 + %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ] + %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ] + %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ] + %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] + %15 = extractelement <4 x float> %reg1, i32 1 + %16 = extractelement <4 x float> %reg1, i32 3 + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = fmul float %18, %0 + %20 = load <4 x float>, <4 x float> addrspace(9)* null + %21 = extractelement <4 x float> %20, i32 1 + %22 = fmul float %21, %0 + %23 = load <4 x float>, <4 x float> addrspace(9)* null + %24 = extractelement <4 x float> %23, i32 2 + %25 = fmul float %24, %0 + %26 = load <4 x float>, <4 x float> addrspace(9)* null + %27 = extractelement <4 x float> %26, i32 3 + %28 = fmul float %27, %0 + %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %30 = extractelement <4 x float> %29, i32 0 + %31 = fmul float %30, %15 + %32 = fadd float %31, %19 + %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %34 = 
extractelement <4 x float> %33, i32 1 + %35 = fmul float %34, %15 + %36 = fadd float %35, %22 + %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %38 = extractelement <4 x float> %37, i32 2 + %39 = fmul float %38, %15 + %40 = fadd float %39, %25 + %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %42 = extractelement <4 x float> %41, i32 3 + %43 = fmul float %42, %15 + %44 = fadd float %43, %28 + %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %46 = extractelement <4 x float> %45, i32 0 + %47 = fmul float %46, %1 + %48 = fadd float %47, %32 + %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %50 = extractelement <4 x float> %49, i32 1 + %51 = fmul float %50, %1 + %52 = fadd float %51, %36 + %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %54 = extractelement <4 x float> %53, i32 2 + %55 = fmul float %54, %1 + %56 = fadd float %55, %40 + %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %58 = extractelement <4 x float> %57, i32 3 + %59 = fmul float %58, %1 + %60 = fadd float %59, %44 + %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %62 = extractelement <4 x float> %61, i32 0 + %63 = fmul float %62, %16 + %64 = fadd float %63, %48 + %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %66 = extractelement <4 x float> %65, i32 1 + %67 = fmul float %66, %16 + %68 = fadd float %67, %52 + %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %70 = extractelement <4 x float> %69, i32 2 + %71 = fmul float %70, %16 + %72 = fadd float %71, %56 + %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %74 = extractelement <4 x float> %73, i32 3 + %75 = fmul float %74, %16 + %76 = fadd float %75, %60 + %77 = insertelement <4 x float> undef, float %64, i32 0 + %78 = insertelement <4 x float> %77, float %68, i32 1 + %79 = insertelement <4 x float> %78, float %72, i32 2 + %80 = insertelement <4 x float> %79, float %76, i32 3 + call void @llvm.AMDGPU.barrier.local() + %81 = insertelement <4 x float> undef, float %temp.0, i32 0 + %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 + %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 + %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 + call void @llvm.AMDGPU.barrier.local() + ret void + +LOOP: ; preds = %main_body, %Flow + %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ] + %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ] + %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ] + %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ] + %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ] + %85 = fcmp uge float %temp4.0, %0 + %86 = select i1 %85, 
float 1.000000e+00, float 0.000000e+00 + %87 = fsub float -0.000000e+00, %86 + %88 = fptosi float %87 to i32 + %89 = bitcast i32 %88 to float + %90 = bitcast float %89 to i32 + %91 = icmp ne i32 %90, 0 + %92 = xor i1 %91, true + br i1 %92, label %ENDIF16, label %Flow + +ENDIF16: ; preds = %LOOP + %93 = fcmp une float %1, %temp4.0 + %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00 + %95 = fsub float -0.000000e+00, %94 + %96 = fptosi float %95 to i32 + %97 = bitcast i32 %96 to float + %98 = bitcast float %97 to i32 + %99 = icmp ne i32 %98, 0 + %100 = xor i1 %99, true + br i1 %100, label %ENDIF19, label %Flow1 + +Flow: ; preds = %Flow1, %LOOP + %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ] + %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ] + %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ] + %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ] + %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ] + %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ] + %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ] + %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ] + %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ] + %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ] + br i1 %110, label %Flow2, label %LOOP + +ENDIF19: ; preds = %ENDIF16 + %111 = fadd float %temp.1, 1.000000e+00 + %112 = fadd float %temp1.1, 0.000000e+00 + %113 = fadd float %temp2.1, 0.000000e+00 + %114 = fadd float %temp3.1, 0.000000e+00 + %115 = fadd float %temp4.0, 1.000000e+00 + br label %Flow1 +} + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll new file mode 100644 index 00000000000..8d980dbf899 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll @@ -0,0 +1,132 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched +;REQUIRES: asserts + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = fcmp ult float %0, 0.000000e+00 + %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 + %6 = fsub float -0.000000e+00, %5 + %7 = fptosi float %6 to i32 + %8 = bitcast i32 %7 to float + %9 = bitcast float %8 to i32 + %10 = icmp ne i32 %9, 0 + br i1 %10, label %LOOP, label %ENDIF + +ENDIF: ; preds = %ENDIF16, %LOOP, %main_body + %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ] + %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ] + %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ] + %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ] + %11 = load <4 x float>, <4 x float> addrspace(9)* null + %12 = extractelement <4 x float> %11, i32 0 + %13 = fmul float %12, %0 + %14 = load <4 x float>, <4 x float> addrspace(9)* null + %15 = extractelement <4 x float> %14, i32 1 + %16 = fmul float %15, %0 + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 2 + %19 = fmul float %18, %0 + %20 = load <4 x float>, <4 x float> addrspace(9)* null + %21 = extractelement <4 x float> %20, i32 3 + %22 = fmul float %21, %0 + %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* 
null, i64 0, i32 1) + %24 = extractelement <4 x float> %23, i32 0 + %25 = fmul float %24, %1 + %26 = fadd float %25, %13 + %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %28 = extractelement <4 x float> %27, i32 1 + %29 = fmul float %28, %1 + %30 = fadd float %29, %16 + %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %32 = extractelement <4 x float> %31, i32 2 + %33 = fmul float %32, %1 + %34 = fadd float %33, %19 + %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %36 = extractelement <4 x float> %35, i32 3 + %37 = fmul float %36, %1 + %38 = fadd float %37, %22 + %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %40 = extractelement <4 x float> %39, i32 0 + %41 = fmul float %40, %2 + %42 = fadd float %41, %26 + %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %44 = extractelement <4 x float> %43, i32 1 + %45 = fmul float %44, %2 + %46 = fadd float %45, %30 + %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %48 = extractelement <4 x float> %47, i32 2 + %49 = fmul float %48, %2 + %50 = fadd float %49, %34 + %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %52 = extractelement <4 x float> %51, i32 3 + %53 = fmul float %52, %2 + %54 = fadd float %53, %38 + %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %56 = extractelement <4 x float> %55, i32 0 + %57 = fmul float %56, %3 + %58 = fadd float %57, %42 + %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %60 = extractelement <4 x float> %59, i32 1 + %61 = fmul float %60, %3 + %62 = fadd float %61, %46 + %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %64 = extractelement <4 x float> %63, i32 2 + %65 = fmul float %64, %3 + %66 = fadd float %65, %50 + %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %68 = extractelement <4 x float> %67, i32 3 + %69 = fmul float %68, %3 + %70 = fadd float %69, %54 + %71 = insertelement <4 x float> undef, float %58, i32 0 + %72 = insertelement <4 x float> %71, float %62, i32 1 + %73 = insertelement <4 x float> %72, float %66, i32 2 + %74 = insertelement <4 x float> %73, float %70, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1) + %75 = insertelement <4 x float> undef, float %temp.0, i32 0 + %76 = insertelement <4 x float> %75, float %temp1.0, i32 1 + %77 = insertelement <4 x float> %76, float %temp2.0, i32 2 + %78 = insertelement <4 x float> %77, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2) + ret void + +LOOP: ; preds = %main_body, %ENDIF19 + %temp.1 = phi float [ %93, %ENDIF19 ], [ 0.000000e+00, %main_body ] + 
%temp1.1 = phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ]
+ %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ]
+ %79 = fcmp uge float %temp4.0, %0
+ %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00
+ %81 = fsub float -0.000000e+00, %80
+ %82 = fptosi float %81 to i32
+ %83 = bitcast i32 %82 to float
+ %84 = bitcast float %83 to i32
+ %85 = icmp ne i32 %84, 0
+ br i1 %85, label %ENDIF, label %ENDIF16
+
+ENDIF16: ; preds = %LOOP
+ %86 = fcmp une float %2, %temp4.0
+ %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00
+ %88 = fsub float -0.000000e+00, %87
+ %89 = fptosi float %88 to i32
+ %90 = bitcast i32 %89 to float
+ %91 = bitcast float %90 to i32
+ %92 = icmp ne i32 %91, 0
+ br i1 %92, label %ENDIF, label %ENDIF19
+
+ENDIF19: ; preds = %ENDIF16
+ %93 = fadd float %temp.1, 1.000000e+00
+ %94 = fadd float %temp1.1, 0.000000e+00
+ %95 = fadd float %temp2.1, 0.000000e+00
+ %96 = fadd float %temp3.1, 0.000000e+00
+ %97 = fadd float %temp4.0, 1.000000e+00
+ br label %LOOP
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
new file mode 100644
index 00000000000..56088718ada
--- /dev/null
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; When a frame index offset is more than 12 bits, make sure we don't store
+; it in mubuf's offset field.
+
+; Also, make sure we use the same register for storing the scratch buffer address
+; for both stores. This register is allocated by the register scavenger, so we
+; should be able to reuse the same register for each scratch buffer access.
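+
+; A rough sanity check of the numbers used below: each alloca is
+; [8192 x i32], i.e. 8192 * 4 = 32768 = 0x8000 bytes, while the mubuf
+; immediate offset field is only 12 bits wide (0..4095). The second scratch
+; buffer therefore starts past what the offset field can encode, which is why
+; the checks expect its base (0x8000) to be put in a VGPR and used with offen
+; addressing instead of being folded into the immediate offset.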
+ +; CHECK-LABEL: {{^}}legal_offset_fi: +; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} +; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen +; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000 +; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} + +define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { +entry: + %scratch0 = alloca [8192 x i32] + %scratch1 = alloca [8192 x i32] + + %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0 + store i32 1, i32* %scratchptr0 + + %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0 + store i32 2, i32* %scratchptr1 + + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %if, label %else + +if: + %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset + %if_value = load i32, i32* %if_ptr + br label %done + +else: + %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset + %else_value = load i32, i32* %else_ptr + br label %done + +done: + %value = phi i32 [%if_value, %if], [%else_value, %else] + store i32 %value, i32 addrspace(1)* %out + ret void + + ret void + +} + +; CHECK-LABEL: {{^}}legal_offset_fi_offset +; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen +; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 +; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} + +define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { +entry: + %scratch0 = alloca [8192 x i32] + %scratch1 = alloca [8192 x i32] + + %offset0 = load i32, i32 addrspace(1)* %offsets + %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0 + store i32 %offset0, i32* %scratchptr0 + + %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1 + %offset1 = load i32, i32 addrspace(1)* %offsetptr1 + %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1 + store i32 %offset1, i32* %scratchptr1 + + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %if, label %else + +if: + %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset + %if_value = load i32, i32* %if_ptr + br label %done + +else: + %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset + %else_value = load i32, i32* %else_ptr + br label %done + +done: + %value = phi i32 [%if_value, %if], [%else_value, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll new file mode 100644 index 00000000000..de645353a40 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdiv.ll @@ -0,0 +1,104 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; The code generated by sdiv is long and complex and may frequently change. +; The goal of this test is to make sure the ISel doesn't fail. 
+;
+; This program was previously failing to compile when one of the selectcc
+; opcodes generated by the sdiv lowering was being legalized and optimized to:
+; selectcc Remainder -1, 0, -1, SETGT
+; This was fixed by adding an additional pattern in R600Instructions.td to
+; match this pattern with a CNDGE_INT.
+
+; FUNC-LABEL: {{^}}sdiv_i32:
+; EG: CF_END
+define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in
+ %den = load i32, i32 addrspace(1) * %den_ptr
+ %result = sdiv i32 %num, %den
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv_i32_4:
+define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 4
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; Multiply by a weird constant to make sure setIntDivIsCheap is
+; working.
+
+; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
+; SI: v_add_i32
+; SI: v_lshrrev_b32
+; SI: v_ashrrev_i32
+; SI: v_add_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 3435
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <2 x i32> %num, %den
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %result = sdiv <2 x i32> %num, <i32 4, i32 4>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <4 x i32> %num, %den
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Tests for 64-bit divide bypass.
+; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %result = sdiv i64 %a, %b +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %result = srem i64 %a, %b +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %resultdiv = sdiv i64 %a, %b +; %resultrem = srem i64 %a, %b +; %result = add i64 %resultdiv, %resultrem +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll new file mode 100644 index 00000000000..ad5df39f550 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdivrem24.ll @@ -0,0 +1,239 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sdiv24_i8: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = sdiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sdiv24_i16: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = sdiv i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}sdiv24_i32: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sdiv25_i32: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; 
EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}srem24_i8: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = srem i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srem24_i16: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = srem i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}srem24_i32: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}srem25_i32: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_srem24_i32_1: +; SI-NOT: v_cvt_f32_i32 
+; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_srem24_i32_2: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll new file mode 100644 index 00000000000..a9b2b7f9df5 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdivrem64.ll @@ -0,0 +1,225 @@ +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_sdiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = sdiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: AND_INT {{.*}}, 1, + 
+;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = urem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll new file mode 100644 index 00000000000..6735394e93a --- /dev/null +++ b/test/CodeGen/AMDGPU/select-i1.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI + +; FUNC-LABEL: {{^}}select_i1: +; SI: v_cndmask_b32 +; SI-NOT: v_cndmask_b32 +define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i1 %a, i1 %b + store i1 %sel, i1 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll new file mode 100644 index 00000000000..59082c65cc8 --- /dev/null +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -0,0 +1,156 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc 
-verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; Test expansion of scalar selects on vectors. +; Evergreen not enabled since it seems to be having problems with doubles. + + +; FUNC-LABEL: {{^}}select_v4i8: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { + %cmp = icmp eq i8 %c, 0 + %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}select_v4i16: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}select_v2i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: buffer_store_dwordx2 +define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b + store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}select_v4i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: buffer_store_dwordx4 +define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b + store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v2f32: +; SI: buffer_store_dwordx2 +define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v4f32: +; SI: buffer_store_dwordx4 +define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x float> %a, <4 x float> %b + store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8f32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x float> %a, <8 x float> %b + store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: 
{{^}}select_v2f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x double> %a, <2 x double> %b + store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v4f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x double> %a, <4 x double> %b + store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x double> %a, <8 x double> %b + store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/select.ll b/test/CodeGen/AMDGPU/select.ll new file mode 100644 index 00000000000..45f3cd5a7ac --- /dev/null +++ b/test/CodeGen/AMDGPU/select.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; Normally icmp + select is optimized to select_cc, when this happens the +; DAGLegalizer never sees the select and doesn't have a chance to legalize it. +; +; In order to avoid the select_cc optimization, this test case calculates the +; condition for the select in a separate basic block.
+ +; FUNC-LABEL: {{^}}select: +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, + <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, + <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, + i32 %cond) { +entry: + br label %for +body: + %inc = add i32 %i, 1 + %br_cmp.i = icmp eq i1 %br_cmp, 0 + br label %for +for: + %i = phi i32 [ %inc, %body], [ 0, %entry ] + %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] + %0 = icmp eq i32 %cond, %i + %1 = select i1 %br_cmp, i32 2, i32 3 + %2 = select i1 %br_cmp, float 2.0 , float 5.0 + %3 = select i1 %br_cmp, <2 x i32> , <2 x i32> + %4 = select i1 %br_cmp, <2 x float> , <2 x float> + %5 = select i1 %br_cmp, <4 x i32> , <4 x i32> + %6 = select i1 %br_cmp, <4 x float> , <4 x float> + br i1 %0, label %body, label %done + +done: + store i32 %1, i32 addrspace(1)* %i32out + store float %2, float addrspace(1)* %f32out + store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out + store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out + store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out + store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out + ret void +} diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll new file mode 100644 index 00000000000..5cebb30dc72 --- /dev/null +++ b/test/CodeGen/AMDGPU/select64.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}select0: +; i64 select should be split into two i32 selects, and we shouldn't need +; to use a shift to extract the hi dword of the input.
+; CHECK-NOT: s_lshr_b64 +; CHECK: v_cndmask +; CHECK: v_cndmask +define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { +entry: + %0 = icmp ugt i32 %cond, 5 + %1 = select i1 %0, i64 0, i64 %in + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}select_trunc_i64: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i64 0, i64 %in + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}select_trunc_i64_2: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}v_select_trunc_i64_2: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %sel = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}v_select_i64_split_imm: +; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 +; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 +; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] +; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] +; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} +; CHECK: s_endpgm +define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32 + store i64 %sel, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-cnd.ll b/test/CodeGen/AMDGPU/selectcc-cnd.ll new file mode 100644 index 00000000000..94d0ace7569 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-cnd.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE +;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, +;CHECK: 1073741824 +define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { + %1 = load float, float addrspace(1)* %in + %2 = fcmp oeq float %1, 0.0 + %3 = select i1 %2, float 1.0, float 2.0 + store float %3, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll new file mode 100644 index 00000000000..58a4ee7d62b --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE_INT +;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, +;CHECK-NEXT: 2 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %1 = load i32, i32 addrspace(1)* %in + %2 = icmp eq i32 %1, 0 + %3 = select i1 %2, i32 1, i32 2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll 
b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll new file mode 100644 index 00000000000..e870ee891e6 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Note additional optimizations may cause this SGT to be replaced with a +; CND* instruction. +; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x, +; CHECK-NEXT: -1 +; Test a selectcc with i32 LHS/RHS and float True/False + +define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = icmp sge i32 %0, 0 + %2 = select i1 %1, float 1.0, float 0.0 + store float %2, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll new file mode 100644 index 00000000000..65be4a626a1 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}test_a: +; EG-NOT: CND +; EG: SET{{[NEQGTL]+}}_DX10 + +define void @test_a(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 0.000000e+00 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + %4 = bitcast i32 %3 to float + %5 = bitcast float %4 to i32 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %IF, label %ENDIF + +IF: + %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %7 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; Same as test_a, but the branch labels are swapped to produce the inverse cc +; for the icmp instruction + +; EG-LABEL: {{^}}test_b: +; EG: SET{{[GTEQN]+}}_DX10 +; EG-NEXT: PRED_ +; EG-NEXT: ALU clause starting +define void @test_b(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 0.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + %4 = bitcast i32 %3 to float + %5 = bitcast float %4 to i32 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %ENDIF, label %IF + +IF: + %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %7 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; Test a CND*_INT instruction with float true/false values +; EG-LABEL: {{^}}test_c: +; EG: CND{{[GTE]+}}_INT +define void @test_c(float addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + %1 = select i1 %0, float 2.0, float 3.0 + store float %1, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}selectcc_bool: +; SI: v_cmp_ne_i32 +; SI-NEXT: v_cndmask_b32_e64 +; SI-NOT: cmp +; SI-NOT: cndmask +define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = select i1 %icmp0, i32 -1, i32 0 + store i32 %ext, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc.ll b/test/CodeGen/AMDGPU/selectcc.ll new file mode 100644 index 00000000000..f378e15dd76 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc.ll @@ -0,0 +1,20 @@ +; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc 
-verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}selectcc_i64: +; EG: XOR_INT +; EG: XOR_INT +; EG: OR_INT +; EG: CNDE_INT +; EG: CNDE_INT +; SI: v_cmp_eq_i64 +; SI: v_cndmask +; SI: v_cndmask +define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { +entry: + %0 = icmp eq i64 %lhs, %rhs + %1 = select i1 %0, i64 %true, i64 %false + store i64 %1, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll new file mode 100644 index 00000000000..53694dcffa6 --- /dev/null +++ b/test/CodeGen/AMDGPU/set-dx10.ll @@ -0,0 +1,161 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests check that floating point comparisons which are used by select +; to store integer true (-1) and false (0) values are lowered to one of the +; SET*DX10 instructions. + +; CHECK: {{^}}fcmp_une_select_fptosi: +; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_une_select_i32: +; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oeq_select_fptosi: +; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oeq_select_i32: +; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ogt_select_fptosi: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ogt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ogt_select_i32: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ogt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + 
+; CHECK: {{^}}fcmp_oge_select_fptosi: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oge float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oge_select_i32: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oge float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ole_select_fptosi: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ole_select_i32: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_olt_select_fptosi: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_olt_select_i32: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc-equivalent.ll b/test/CodeGen/AMDGPU/setcc-equivalent.ll new file mode 100644 index 00000000000..11ea793650c --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc-equivalent.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +; EG-LABEL: {{^}}and_setcc_setcc_i32: +; EG: AND_INT +; EG-NEXT: SETE_INT +define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %cmp1 = icmp eq i32 %a, -1 + %cmp2 = icmp eq i32 %b, -1 + %and = and i1 %cmp1, %cmp2 + %ext = sext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}and_setcc_setcc_v4i32: +; EG: AND_INT +; EG: AND_INT +; EG: SETE_INT +; EG: AND_INT +; EG: SETE_INT +; EG: AND_INT +; EG: SETE_INT +define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %cmp1 = icmp eq <4 x i32> %a, + %cmp2 = icmp eq <4 x i32> %b, + %and = and <4 x i1> %cmp1, %cmp2 + %ext = sext <4 x i1> %and to <4 x i32> + store <4 x i32> %ext, <4 x i32> 
addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll new file mode 100644 index 00000000000..4e6a10d6b78 --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc-opt.ll @@ -0,0 +1,236 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT:buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm + +; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W +; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 +define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm + +; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W +; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 +define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; This really folds away to false +; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc +; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, +; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN-NEXT: s_endpgm +define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; This really folds away to true +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc +; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, +; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN-NEXT: s_endpgm +define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define 
void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: +; GCN-NOT: v_cmp +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: +; GCN-NOT: v_cmp +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: +; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}} +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_zext_k_i8max: +; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} +; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = zext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, 255 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1: +; GCN: buffer_load_sbyte [[B:v[0-9]+]] +; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { + %b = load i8, i8 addrspace(1)* %b.ptr + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: +; GCN: s_load_dword [[B:s[0-9]+]] +; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]] +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FIXME: This ends up doing a buffer_load_ubyte, and compare to +; 255. Seems to be because of ordering problems when not allowing load widths to be reduced.
+; Should do a buffer_load_sbyte and compare with -1 + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: +; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} +; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_zext_k_neg1: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = zext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll new file mode 100644 index 00000000000..f33a82df5ff --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc.ll @@ -0,0 +1,377 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}setcc_v2i32: +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y + +define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = icmp eq <2 x i32> %a, %b + %sext = sext <2 x i1> %result to <2 x i32> + store <2 x i32> %sext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}setcc_v4i32: +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = icmp eq <4 x i32> %a, %b + %sext = sext <4 x i1> %result to <4 x i32> + store <4 x i32> %sext, <4 x i32> addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; Float comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}f32_oeq: +; R600: SETE_DX10 +; SI: v_cmp_eq_f32 +define void 
@f32_oeq(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp oeq float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ogt: +; R600: SETGT_DX10 +; SI: v_cmp_gt_f32 +define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ogt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_oge: +; R600: SETGE_DX10 +; SI: v_cmp_ge_f32 +define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp oge float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_olt: +; R600: SETGT_DX10 +; SI: v_cmp_lt_f32 +define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp olt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ole: +; R600: SETGE_DX10 +; SI: v_cmp_le_f32 +define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ole float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_one: +; R600-DAG: SETE_DX10 +; R600-DAG: SETE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_INT + +; SI: v_cmp_lg_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp one float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ord: +; R600-DAG: SETE_DX10 +; R600-DAG: SETE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_INT +; SI: v_cmp_o_f32 +define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ord float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ueq: +; R600-DAG: SETNE_DX10 +; R600-DAG: SETNE_DX10 +; R600-DAG: OR_INT +; R600-DAG: SETE_DX10 +; R600-DAG: OR_INT +; R600-DAG: SETNE_INT + +; SI: v_cmp_nlg_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ueq float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ugt: +; R600: SETGE +; R600: SETE_DX10 +; SI: v_cmp_nle_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ugt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_uge: +; R600: SETGT +; R600: SETE_DX10 + +; SI: v_cmp_nlt_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp uge float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ult: +; R600: SETGE +; R600: SETE_DX10 + +; SI: v_cmp_nge_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ult float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ule: +; R600: SETGT +; R600: SETE_DX10 + +; SI: v_cmp_ngt_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ule(i32 
addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ule float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_une: +; R600: SETNE_DX10 +; SI: v_cmp_neq_f32 +define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp une float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_uno: +; R600: SETNE_DX10 +; R600: SETNE_DX10 +; R600: OR_INT +; R600: SETNE_INT +; SI: v_cmp_u_f32 +define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp uno float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; 32-bit integer comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}i32_eq: +; R600: SETE_INT +; SI: v_cmp_eq_i32 +define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ne: +; R600: SETNE_INT +; SI: v_cmp_ne_i32 +define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ne i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ugt: +; R600: SETGT_UINT +; SI: v_cmp_gt_u32 +define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ugt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_uge: +; R600: SETGE_UINT +; SI: v_cmp_ge_u32 +define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp uge i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ult: +; R600: SETGT_UINT +; SI: v_cmp_lt_u32 +define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ult i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ule: +; R600: SETGE_UINT +; SI: v_cmp_le_u32 +define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ule i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sgt: +; R600: SETGT_INT +; SI: v_cmp_gt_i32 +define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sgt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sge: +; R600: SETGE_INT +; SI: v_cmp_ge_i32 +define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sge i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_slt: +; R600: SETGT_INT +; SI: v_cmp_lt_i32 +define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp slt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sle: +; R600: SETGE_INT +; SI: v_cmp_le_i32 +define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sle i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FIXME: This does 4 compares +; FUNC-LABEL: {{^}}v3i32_eq: +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: 
v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI: s_endpgm +define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid + %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid + %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a + %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b + %cmp = icmp eq <3 x i32> %a, %b + %ext = sext <3 x i1> %cmp to <3 x i32> + store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v3i8_eq: +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI: s_endpgm +define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid + %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid + %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid + %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a + %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b + %cmp = icmp eq <3 x i8> %a, %b + %ext = sext <3 x i1> %cmp to <3 x i8> + store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll new file mode 100644 index 00000000000..231be7aa3da --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc64.ll @@ -0,0 +1,259 @@ +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; XXX: Merge this into setcc, once R600 supports 64-bit operations + +;;;==========================================================================;;; +;; Double comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}f64_oeq: +; SI: v_cmp_eq_f64 +define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp oeq double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ogt: +; SI: v_cmp_gt_f64 +define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ogt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_oge: +; SI: v_cmp_ge_f64 +define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp oge double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_olt: +; SI: v_cmp_lt_f64 +define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp olt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ole: +; SI: v_cmp_le_f64 +define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ole double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* 
%out + ret void +} + +; FUNC-LABEL: {{^}}f64_one: +; SI: v_cmp_lg_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp one double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ord: +; SI: v_cmp_o_f64 +define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ord double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ueq: +; SI: v_cmp_nlg_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ueq double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ugt: + +; SI: v_cmp_nle_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ugt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_uge: +; SI: v_cmp_nlt_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp uge double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ult: +; SI: v_cmp_nge_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ult double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ule: +; SI: v_cmp_ngt_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ule double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_une: +; SI: v_cmp_neq_f64 +define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp une double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_uno: +; SI: v_cmp_u_f64 +define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp uno double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; 64-bit integer comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}i64_eq: +; SI: v_cmp_eq_i64 +define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ne: +; SI: v_cmp_ne_i64 +define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ne i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ugt: +; SI: v_cmp_gt_u64 +define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ugt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_uge: +; SI: v_cmp_ge_u64 +define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp uge i64 %a, %b 
+ %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ult: +; SI: v_cmp_lt_u64 +define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ult i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ule: +; SI: v_cmp_le_u64 +define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ule i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sgt: +; SI: v_cmp_gt_i64 +define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sgt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sge: +; SI: v_cmp_ge_i64 +define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sge i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_slt: +; SI: v_cmp_lt_i64 +define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp slt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sle: +; SI: v_cmp_le_i64 +define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sle i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll new file mode 100644 index 00000000000..9b5d6b5dbd6 --- /dev/null +++ b/test/CodeGen/AMDGPU/seto.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] +; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] +define void @main(float %p) { +main_body: + %c = fcmp oeq float %p, %p + %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/setuo.ll b/test/CodeGen/AMDGPU/setuo.ll new file mode 100644 index 00000000000..76346c4f624 --- /dev/null +++ b/test/CodeGen/AMDGPU/setuo.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] +; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] +define void @main(float %p) { +main_body: + %c = fcmp une float %p, %p + %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/sext-eliminate.ll b/test/CodeGen/AMDGPU/sext-eliminate.ll new file mode 100644 index 00000000000..7dc6eb87f6b --- /dev/null +++ b/test/CodeGen/AMDGPU/sext-eliminate.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add: + +; EG: 
MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: SUB_INT {{[* ]*}}[[RES]] +; EG-NOT: BFE +define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { + %sext = sext i1 %a to i32 + %res = add i32 %b, %sext + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub: + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT {{[* ]*}}[[RES]] +; EG-NOT: BFE +define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { + %sext = sext i1 %a to i32 + %res = sub i32 %b, %sext + store i32 %res, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll new file mode 100644 index 00000000000..5aedda2ce1a --- /dev/null +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -0,0 +1,611 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 +; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]] +; SI: buffer_store_dword [[EXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { + %shl = shl i32 %in, 31 + %sext = ashr i32 %shl, 31 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %shl = shl i32 %c, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %shl = shl i32 %c, 16 + %ashr = ashr i32 %shl, 16 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void 
@sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+ %c = add <1 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <1 x i32> %c, <i32 24>
+ %ashr = ashr <1 x i32> %shl, <i32 24>
+ store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 63
+ %ashr = ashr i64 %shl, 63
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO Check address computation, using | with variables in {{}} does not work,
+;; also the _LO/_HI order might be different
+define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 56
+ %ashr = ashr i64 %shl, 56
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO Check address computation, using | with variables in {{}} does not work,
+;; also the _LO/_HI order might be different
+define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 48
+ %ashr = ashr i64 %shl, 48
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE_INT
+
+; EG: ASHR [[RES_HI]]
+
+; EG: LSHR
+; EG: LSHR
+;; TODO Check address computation, using | with variables in {{}} does not work,
+;; also the _LO/_HI order might be different
+define 
void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %c = shl i64 %a, %b + %shl = shl i64 %c, 32 + %ashr = ashr i64 %shl, 32 + store i64 %ashr, i64 addrspace(1)* %out, align 8 + ret void +} + +; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments. +; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64: +; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288 +; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31 +; XSI: buffer_store_dword +; XEG: BFE_INT +; XEG: ASHR +; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind { +; %c = add <1 x i64> %a, %b +; %shl = shl <1 x i64> %c, +; %ashr = ashr <1 x i64> %shl, +; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8 +; ret void +; } + +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 63 + %ashr = ashr i64 %shl, 63 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 56 + %ashr = ashr i64 %shl, 56 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 48 + %ashr = ashr i64 %shl, 48 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: 
{{^}}v_sext_in_reg_i32_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a.gep, align 8
+ %b = load i64, i64 addrspace(1)* %b.gep, align 8
+
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 32
+ %ashr = ashr i64 %shl, 32
+ store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b
+ %x = shl i32 %c, 6
+ %y = ashr i32 %x, 7
+ store i32 %y, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI: s_endpgm
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b
+ %x = shl <2 x i32> %c, <i32 6, i32 6>
+ %y = ashr <2 x i32> %x, <i32 7, i32 7>
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 31, i32 31>
+ %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx4
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+ %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 24, i32 24>
+ %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx4
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 16, i32 16>
+ %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}testcase:
+define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}testcase_3:
+define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+ %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
+; SI: buffer_load_sbyte
+; SI: v_max_i32
+; SI-NOT: bfe
+; SI: buffer_store_short
+define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8, i8 addrspace(1)* %src, align 1
+ %tmp2 = sext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = sext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfe_0_width:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_8_bfe_8:
+; SI: v_bfe_i32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_8_bfe_16:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI: s_endpgm
+define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; This really should be folded into 1
+; FUNC-LABEL: {{^}}bfe_16_bfe_8:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure there isn't a redundant BFE
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ 
store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: +define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: +; SI: buffer_load_sbyte +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI: .text +; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: +; SI-NOT: shr +; SI-NOT: shl +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: +; SI: buffer_load_dword +; SI-NOT: shl +; SI-NOT: shr +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 +; SI: s_endpgm +define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: +; SI: buffer_load_dword +; SI-NOT: v_lshl +; SI-NOT: v_ashr +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 +; SI: s_endpgm +define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll new file mode 100644 index 00000000000..38289ced632 --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; +; +; Most SALU instructions ignore control flow, so we need to make sure +; they don't overwrite values from other blocks. 
+ +; If the branch decision is made based on a value in an SGPR then all +; threads will execute the same code paths, so we don't need to worry +; about instructions in different blocks overwriting each other. +; SI-LABEL: {{^}}sgpr_if_else_salu_br: +; SI: s_add +; SI: s_add + +define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = add i32 %b, %c + br label %endif + +else: + %2 = add i32 %d, %e + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + %4 = add i32 %3, %a + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; The two S_ADD instructions should write to different registers, since +; different threads will take different control flow paths. + +; SI-LABEL: {{^}}sgpr_if_else_valu_br: +; SI: s_add_i32 [[SGPR:s[0-9]+]] +; SI-NOT: s_add_i32 [[SGPR]] + +define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tid_f = uitofp i32 %tid to float + %tmp1 = fcmp ueq float %tid_f, 0.0 + br i1 %tmp1, label %if, label %else + +if: + %tmp2 = add i32 %b, %c + br label %endif + +else: + %tmp3 = add i32 %d, %e + br label %endif + +endif: + %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else] + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should write to different SGPR pairs instead of copying to +; VALU for i1 phi. + +; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br: +; SI: buffer_load_dword [[AVAL:v[0-9]+]] +; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] +; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] + +; SI: BB2_1: +; SI: buffer_load_dword [[AVAL:v[0-9]+]] +; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] +; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] + +; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] +; SI: buffer_store_dword [[RESULT]] +define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = icmp eq i32 %tid, 0 + br i1 %tmp1, label %if, label %else + +if: + %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid + %a.val = load i32, i32 addrspace(1)* %gep.if + %cmp.if = icmp eq i32 %a.val, 0 + br label %endif + +else: + %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid + %b.val = load i32, i32 addrspace(1)* %gep.else + %cmp.else = icmp slt i32 %b.val, 0 + br label %endif + +endif: + %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else] + %ext = sext i1 %tmp4 to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll new file mode 100644 index 00000000000..df67fcca22f --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; Copy VGPR -> SGPR used twice as an instruction operand, which is then +; used in an REG_SEQUENCE that also needs to be handled. 
+ +; SI-LABEL: {{^}}test_dup_operands: +; SI: v_add_i32_e32 +define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %lo = extractelement <2 x i32> %a, i32 0 + %hi = extractelement <2 x i32> %a, i32 1 + %add = add i32 %lo, %lo + %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0 + %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1 + store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll new file mode 100644 index 00000000000..b849c4038bc --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -0,0 +1,379 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; This test checks that no VGPR to SGPR copies are created by the register +; allocator. +; CHECK-LABEL: {{^}}phi1: +; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 +; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] + +define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) + %25 = fptosi float %23 to i32 + %26 = icmp ne i32 %25, 0 + br i1 %26, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %27 = fsub float -0.000000e+00, %22 + br label %ENDIF + +ENDIF: ; preds = %main_body, %ELSE + %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ] + %28 = fadd float %temp.0, %24 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00) + ret void +} + +; Make sure this program doesn't crash +; CHECK-LABEL: {{^}}phi2: +define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36) + %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40) + %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48) + %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52) + %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56) + %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64) + %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68) + %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72) + %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76) + %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80) + %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84) + %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88) + %36 = call float 
@llvm.SI.load.const(<16 x i8> %21, i32 92) + %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1 + %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 + %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1 + %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) + %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) + %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5) + %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5) + %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5) + %46 = bitcast float %41 to i32 + %47 = bitcast float %42 to i32 + %48 = insertelement <2 x i32> undef, i32 %46, i32 0 + %49 = insertelement <2 x i32> %48, i32 %47, i32 1 + %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2) + %51 = extractelement <4 x float> %50, i32 2 + %52 = call float @fabs(float %51) + %53 = fmul float %43, %43 + %54 = fmul float %44, %44 + %55 = fadd float %54, %53 + %56 = fmul float %45, %45 + %57 = fadd float %55, %56 + %58 = call float @llvm.AMDGPU.rsq.f32(float %57) + %59 = fmul float %43, %58 + %60 = fmul float %44, %58 + %61 = fmul float %45, %58 + %62 = fmul float %59, %23 + %63 = fmul float %60, %24 + %64 = fadd float %63, %62 + %65 = fmul float %61, %25 + %66 = fadd float %64, %65 + %67 = fsub float -0.000000e+00, %26 + %68 = fmul float %66, %52 + %69 = fadd float %68, %67 + %70 = fmul float %27, %69 + %71 = fmul float %28, %69 + %72 = call float @fabs(float %70) + %73 = fcmp olt float 0x3EE4F8B580000000, %72 + %74 = sext i1 %73 to i32 + %75 = bitcast i32 %74 to float + %76 = bitcast float %75 to i32 + %77 = icmp ne i32 %76, 0 + br i1 %77, label %IF, label %ENDIF + +IF: ; preds = %main_body + %78 = fsub float -0.000000e+00, %70 + %79 = call float @llvm.AMDIL.exp.(float %78) + %80 = fsub float -0.000000e+00, %79 + %81 = fadd float 1.000000e+00, %80 + %82 = fdiv float 1.000000e+00, %70 + %83 = fmul float %81, %82 + %84 = fmul float %32, %83 + br label %ENDIF + +ENDIF: ; preds = %main_body, %IF + %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ] + %85 = call float @fabs(float %71) + %86 = fcmp olt float 0x3EE4F8B580000000, %85 + %87 = sext i1 %86 to i32 + %88 = bitcast i32 %87 to float + %89 = bitcast float %88 to i32 + %90 = icmp ne i32 %89, 0 + br i1 %90, label %IF25, label %ENDIF24 + +IF25: ; preds = %ENDIF + %91 = fsub float -0.000000e+00, %71 + %92 = call float @llvm.AMDIL.exp.(float %91) + %93 = fsub float -0.000000e+00, %92 + %94 = fadd float 1.000000e+00, %93 + %95 = fdiv float 1.000000e+00, %71 + %96 = fmul float %94, %95 + %97 = fmul float %36, %96 + br label %ENDIF24 + +ENDIF24: ; preds = %ENDIF, %IF25 + %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ] + %98 = fmul float %29, %temp4.0 + %99 = fmul float %30, %temp4.0 + %100 = fmul float %31, %temp4.0 + %101 = fmul float %33, %temp8.0 + %102 = fadd float %101, %98 + %103 = fmul float %34, %temp8.0 + %104 = fadd float %103, %99 + %105 = fmul float %35, %temp8.0 + %106 = fadd float %105, %100 + %107 = call float @llvm.pow.f32(float %52, float %22) + %108 = fsub float -0.000000e+00, %102 + %109 = fmul float %108, %107 + %110 = fsub float -0.000000e+00, %104 + %111 = fmul float %110, %107 + %112 = fsub float -0.000000e+00, %106 + %113 = fmul float %112, %107 + %114 = call i32 @llvm.SI.packf16(float %109, float %111) + %115 = bitcast i32 %114 to float + %116 = call i32 @llvm.SI.packf16(float 
%113, float 1.000000e+00)
+ %117 = bitcast i32 %116 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+ ret void
+}
+
+; We just want to make sure the program doesn't crash
+; CHECK-LABEL: {{^}}loop:
+
+define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
+ %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12)
+ %26 = fptosi float %25 to i32
+ %27 = bitcast i32 %26 to float
+ %28 = bitcast float %27 to i32
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %main_body
+ %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ]
+ %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ]
+ %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ]
+ %29 = bitcast float %temp8.0 to i32
+ %30 = icmp sge i32 %29, %28
+ %31 = sext i1 %30 to i32
+ %32 = bitcast i32 %31 to float
+ %33 = bitcast float %32 to i32
+ %34 = icmp ne i32 %33, 0
+ br i1 %34, label %IF, label %ENDIF
+
+IF: ; preds = %LOOP
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %35 = bitcast float %temp8.0 to i32
+ %36 = add i32 %35, 1
+ %37 = bitcast i32 %36 to float
+ br label %LOOP
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readonly }
+attributes #3 = { readnone }
+attributes #4 = { nounwind readonly }
+
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq.f32(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #3
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+; This checks for a bug in the FixSGPRCopies pass where VReg96
+; registers were being identified as an SGPR regclass which was causing
+; an assertion failure. 
+ +; CHECK-LABEL: {{^}}sample_v3: +; CHECK: image_sample +; CHECK: image_sample +; CHECK: exp +; CHECK: s_endpgm +define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { + +entry: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16) + %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2 + %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2 + %28 = fcmp oeq float %23, 0.0 + br i1 %28, label %if, label %else + +if: + %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) + %val.if.0 = extractelement <4 x float> %val.if, i32 0 + %val.if.1 = extractelement <4 x float> %val.if, i32 1 + %val.if.2 = extractelement <4 x float> %val.if, i32 2 + br label %endif + +else: + %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) + %val.else.0 = extractelement <4 x float> %val.else, i32 0 + %val.else.1 = extractelement <4 x float> %val.else, i32 1 + %val.else.2 = extractelement <4 x float> %val.else, i32 2 + br label %endif + +endif: + %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else] + %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else] + %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else] + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0) + ret void +} + +!2 = !{!"const", null, i32 1} + +; CHECK-LABEL: {{^}}copy1: +; CHECK: buffer_load_dword +; CHECK: v_add +; CHECK: s_endpgm +define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { +entry: + %0 = load float, float addrspace(1)* %in0 + %1 = fcmp oeq float %0, 0.0 + br i1 %1, label %if0, label %endif + +if0: + %2 = bitcast float %0 to i32 + %3 = fcmp olt float %0, 0.0 + br i1 %3, label %if1, label %endif + +if1: + %4 = add i32 %2, 1 + br label %endif + +endif: + %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ] + %6 = bitcast i32 %5 to float + store float %6, float addrspace(1)* %out + ret void +} + +; This test is just checking that we don't crash / assertion fail. 
+; CHECK-LABEL: {{^}}copy2: +; CHECK: s_endpgm + +define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +entry: + br label %LOOP68 + +LOOP68: + %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ] + %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ] + %g = icmp eq i32 0, %t + %l = bitcast float %temp4.7 to i32 + br i1 %g, label %IF70, label %ENDIF69 + +IF70: + %q = icmp ne i32 %l, 13 + %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + ret void + +ENDIF69: + %u = add i32 %l, %t + %v = bitcast i32 %u to float + %x = add i32 %t, -1 + br label %LOOP68 +} + +attributes #0 = { "ShaderType"="0" } + +; This test checks that image_sample resource descriptors aren't loaded into +; vgprs. The verifier will fail if this happens. +; CHECK-LABEL:{{^}}sample_rsrc: +; CHECK: image_sample +; CHECK: image_sample +; CHECK: s_endpgm +define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +bb: + %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 + %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) + %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 + %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0 + %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 + %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0 + %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp31 = bitcast float %tmp23 to i32 + %tmp36 = icmp ne i32 %tmp31, 0 + br i1 %tmp36, label %bb38, label %bb80 + +bb38: ; preds = %bb + %tmp52 = bitcast float %tmp29 to i32 + %tmp53 = bitcast float %tmp30 to i32 + %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 + %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 + %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> + %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> + %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) + br label %bb71 + +bb80: ; preds = %bb + %tmp81 = bitcast float %tmp29 to i32 + %tmp82 = bitcast float %tmp30 to i32 + %tmp82.2 = add i32 %tmp82, 1 + %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 + %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 + %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> + %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> + %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) + br label %bb71 + +bb71: ; preds = %bb80, %bb38 + %tmp72 = phi <4 x float> [ %tmp58, 
%bb38 ], [ %tmp87, %bb80 ] + %tmp88 = extractelement <4 x float> %tmp72, i32 0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) + ret void +} + +attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shared-op-cycle.ll b/test/CodeGen/AMDGPU/shared-op-cycle.ll new file mode 100644 index 00000000000..f52a9baf4d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/shared-op-cycle.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main: +; CHECK: MULADD_IEEE * +; CHECK-NOT: MULADD_IEEE * + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { + %w0 = extractelement <4 x float> %reg0, i32 3 + %w1 = extractelement <4 x float> %reg1, i32 3 + %w2 = extractelement <4 x float> %reg2, i32 3 + %sq0 = fmul float %w0, %w0 + %r0 = fadd float %sq0, 2.0 + %sq1 = fmul float %w1, %w1 + %r1 = fadd float %sq1, 2.0 + %sq2 = fmul float %w2, %w2 + %r2 = fadd float %sq2, 2.0 + %v0 = insertelement <4 x float> undef, float %r0, i32 0 + %v1 = insertelement <4 x float> %v0, float %r1, i32 1 + %v2 = insertelement <4 x float> %v1, float %r2, i32 2 + %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2) + %vecres = insertelement <4 x float> undef, float %res, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll new file mode 100644 index 00000000000..53b63dc4b8a --- /dev/null +++ b/test/CodeGen/AMDGPU/shl.ll @@ -0,0 +1,180 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s + +;EG: {{^}}shl_v2i32: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v2i32: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v2i32: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = shl <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i32: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v4i32: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v4i32: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = shl <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_i64: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 + +;SI: {{^}}shl_i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1) * %in + %b = load i64, i64 addrspace(1) * %b_ptr + %result = shl i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v2i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI: {{^}}shl_v2i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_v2i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1) * %in + %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr + %result = shl <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHC]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHD]] +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHC]] +;EG-DAG: LSHL {{.*}}, [[SHD]] +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHC]] +;EG-DAG: LSHL {{.*}}, [[SHD]] +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHD]], literal +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI: {{^}}shl_v4i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_v4i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1) * %in + %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr + %result = shl <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll new file mode 100644 index 00000000000..b1485bfaaeb --- /dev/null +++ b/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -0,0 +1,90 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; Test with inline immediate + +; FUNC-LABEL: {{^}}shl_2_add_9_i32: +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %add = add i32 %val, 9 + %result = shl i32 %add, 2 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: +; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} +; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} +; SI-DAG: buffer_store_dword [[ADDREG]] +; SI-DAG: buffer_store_dword [[SHLREG]] +; SI: s_endpgm +define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %add = add i32 %val, 9 + %result = shl i32 %add, 2 + store i32 %result, i32 addrspace(1)* %out0, align 4 + store i32 %add, i32 addrspace(1)* %out1, align 4 + ret void +} + +; Test with add literal constant + +; FUNC-LABEL: {{^}}shl_2_add_999_i32: +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %shl = add i32 %val, 999 + %result = shl i32 %shl, 2 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}test_add_shl_add_constant: +; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: buffer_store_dword [[VRESULT]] +define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %shl, %y + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: +; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: buffer_store_dword [[VRESULT]] + +define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %y, %shl + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll new file mode 100644 index 00000000000..6671e909cd1 --- /dev/null +++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -0,0 +1,284 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + +; Test that doing a shift of a pointer with a constant add will be +; folded into the constant offset addressing mode even if the add has +; multiple uses. This is relevant to accessing 2 separate, adjacent +; LDS globals. + + +declare i32 @llvm.r600.read.tidig.x() #1 + +@lds0 = addrspace(3) global [512 x float] undef, align 4 +@lds1 = addrspace(3) global [512 x float] undef, align 4 + + +; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8 + +; SI-LABEL: {{^}}load_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 +; SI: s_endpgm +define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + store float %val0, float addrspace(1)* %out + ret void +} + +; Make sure once the first use is folded into the addressing mode, the +; remaining add use goes through the normal shl + add constant fold. 
+ +; SI-LABEL: {{^}}load_shl_base_lds_1: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 +; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} +; SI-DAG: buffer_store_dword [[RESULT]] +; SI-DAG: buffer_store_dword [[ADDUSE]] +; SI: s_endpgm +define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %shl_add_use = shl i32 %idx.0, 2 + store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4 + store float %val0, float addrspace(1)* %out + ret void +} + +@maxlds = addrspace(3) global [65536 x i8] undef, align 4 + +; SI-LABEL: {{^}}load_shl_base_lds_max_offset +; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 +; SI: s_endpgm +define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 65535 + %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 + %val0 = load i8, i8 addrspace(3)* %arrayidx0 + store i32 %idx.0, i32 addrspace(1)* %add_use + store i8 %val0, i8 addrspace(1)* %out + ret void +} + +; The two globals are placed adjacent in memory, so the same base +; pointer can be used with an offset into the second one. + +; SI-LABEL: {{^}}load_shl_base_lds_2: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 +; SI: s_endpgm +define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + store float %sum, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}store_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + store float 1.0, float addrspace(3)* %arrayidx0, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + + +; -------------------------------------------------------------------------------- +; Atomics. 
+ +@lds2 = addrspace(3) global [512 x i32] undef, align 4 + +; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 +; %idx.0 = add nsw i32 %tid.x, 2 +; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 +; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 +; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 +; ret void +; } + + +; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_add_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_and_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 
addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_or_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 +; %idx.0 = add nsw i32 %tid.x, 2 +; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 +; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst +; store i32 %val, i32 addrspace(1)* %out, align 4 +; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 +; ret void +; } + +; SI-LABEL: {{^}}atomic_min_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_max_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: 
{{^}}atomic_umin_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll new file mode 100644 index 00000000000..69d719385ac --- /dev/null +++ b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll @@ -0,0 +1,25 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s + + +define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +; CHECK-LABEL: {{^}}test: + +entry: + switch i32 %x, label %sw.default [ + i32 0, label %sw.bb + i32 60, label %sw.bb + ] + +sw.bb: + unreachable + +sw.default: + unreachable + +sw.epilog: + ret void +} + diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll new file mode 100644 index 00000000000..bbcb861f37d --- /dev/null +++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -0,0 +1,63 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}break_inserted_outside_of_loop: + +; SI: [[LOOP_LABEL:[A-Z0-9]+]]: +; Lowered break instruction: +; SI: s_or_b64 +; Lowered Loop instruction: +; SI: s_andn2_b64 +; s_cbranch_execnz [[LOOP_LABEL]] +; SI: s_endpgm +define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) { +main_body: + %0 = and i32 %a, %b + %1 = trunc i32 %0 to i1 + br label %ENDIF + +ENDLOOP: + store i32 0, i32 addrspace(1)* %out + ret void + +ENDIF: + br i1 %1, label %ENDLOOP, label %ENDIF +} + + +; FUNC-LABEL: {{^}}phi_cond_outside_loop: +; FIXME: This could be folded into the s_or_b64 instruction +; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0 +; SI: [[LOOP_LABEL:[A-Z0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} + +; SI_IF_BREAK instruction: +; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]] + +; SI_LOOP instruction: +; SI: s_andn2_b64 exec, exec, [[BREAK]] +; SI: s_cbranch_execnz [[LOOP_LABEL]] +; SI: s_endpgm
+ +define void @phi_cond_outside_loop(i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a , 0 + br i1 %0, label %if, label %else + +if: + br label %endif + +else: + %1 = icmp eq i32 %b, 0 + br label %endif + +endif: + %2 = phi i1 [0, %if], [%1, %else] + br label %loop + +loop: + br i1 %2, label %exit, label %loop + +exit: + ret void +} diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll new file mode 100644 index 00000000000..944499a1146 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -0,0 +1,52 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; This shader has the potential to generate illegal VGPR to SGPR copies if +; the wrong register class is used for the REG_SEQUENCE instructions. + +; CHECK: {{^}}main: +; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}} + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1 + %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 + %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1 + %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) + %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) + %29 = bitcast float %22 to i32 + %30 = bitcast float %27 to i32 + %31 = bitcast float %28 to i32 + %32 = insertelement <4 x i32> undef, i32 %29, i32 0 + %33 = insertelement <4 x i32> %32, i32 %30, i32 1 + %34 = insertelement <4 x i32> %33, i32 %31, i32 2 + %35 = insertelement <4 x i32> %34, i32 undef, i32 3 + %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2) + %37 = extractelement <4 x float> %36, i32 0 + %38 = extractelement <4 x float> %36, i32 1 + %39 = extractelement <4 x float> %36, i32 2 + %40 = extractelement <4 x float> %36, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null} +!1 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll new file mode 100644 index 00000000000..84652701f77 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -0,0 +1,1568 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; These tests check that the compiler won't crash when it needs to spill +; SGPRs.
+ +; CHECK-LABEL: {{^}}main: +; CHECK: s_wqm +; Writing to M0 from an SMRD instruction will hang the GPU. +; CHECK-NOT: s_buffer_load_dword m0 +; CHECK: s_endpgm +@ddxy_lds = external addrspace(3) global [64 x i32] + +define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96) + %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100) + %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104) + %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112) + %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116) + %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120) + %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) + %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) + %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140) + %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) + %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) + %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) + %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) + %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) + %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) + %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) + %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) + %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) + %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) + %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) + %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224) + %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) + %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) + %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) + %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) + %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) + %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) + %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) + %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) + %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) + %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296) + %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304) + %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308) + %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312) + %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368) + %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372) + %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376) + %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384) + %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0 + %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0 + %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 + %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0 + %67 = getelementptr [32 x 
<16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 + %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0 + %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 + %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0 + %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 + %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0 + %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 + %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0 + %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 + %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0 + %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 + %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0 + %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 + %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0 + %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 + %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0 + %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 + %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0 + %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 + %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0 + %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 + %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0 + %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 + %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0 + %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 + %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0 + %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) + %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) + %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) + %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) + %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) + %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) + %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) + %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) + %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) + %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) + %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) + %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) + %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) + %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) + %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) + %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) + %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) + %110 = call i32 @llvm.SI.tid() + %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110 + %112 = bitcast float %93 to i32 + store i32 %112, i32 addrspace(3)* %111 + %113 = bitcast float %94 to i32 + store i32 %113, i32 addrspace(3)* %111 + %114 = call i32 @llvm.SI.tid() + %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* 
@ddxy_lds, i32 0, i32 %114 + %116 = and i32 %114, -4 + %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116 + %118 = add i32 %116, 1 + %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118 + %120 = bitcast float %93 to i32 + store i32 %120, i32 addrspace(3)* %115 + %121 = load i32, i32 addrspace(3)* %117 + %122 = bitcast i32 %121 to float + %123 = load i32, i32 addrspace(3)* %119 + %124 = bitcast i32 %123 to float + %125 = fsub float %124, %122 + %126 = bitcast float %94 to i32 + store i32 %126, i32 addrspace(3)* %115 + %127 = load i32, i32 addrspace(3)* %117 + %128 = bitcast i32 %127 to float + %129 = load i32, i32 addrspace(3)* %119 + %130 = bitcast i32 %129 to float + %131 = fsub float %130, %128 + %132 = insertelement <4 x float> undef, float %125, i32 0 + %133 = insertelement <4 x float> %132, float %131, i32 1 + %134 = insertelement <4 x float> %133, float %131, i32 2 + %135 = insertelement <4 x float> %134, float %131, i32 3 + %136 = extractelement <4 x float> %135, i32 0 + %137 = extractelement <4 x float> %135, i32 1 + %138 = fmul float %60, %93 + %139 = fmul float %60, %94 + %140 = fmul float %60, %94 + %141 = fmul float %60, %94 + %142 = call i32 @llvm.SI.tid() + %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142 + %144 = bitcast float %138 to i32 + store i32 %144, i32 addrspace(3)* %143 + %145 = bitcast float %139 to i32 + store i32 %145, i32 addrspace(3)* %143 + %146 = bitcast float %140 to i32 + store i32 %146, i32 addrspace(3)* %143 + %147 = bitcast float %141 to i32 + store i32 %147, i32 addrspace(3)* %143 + %148 = call i32 @llvm.SI.tid() + %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148 + %150 = and i32 %148, -4 + %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150 + %152 = add i32 %150, 2 + %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152 + %154 = bitcast float %138 to i32 + store i32 %154, i32 addrspace(3)* %149 + %155 = load i32, i32 addrspace(3)* %151 + %156 = bitcast i32 %155 to float + %157 = load i32, i32 addrspace(3)* %153 + %158 = bitcast i32 %157 to float + %159 = fsub float %158, %156 + %160 = bitcast float %139 to i32 + store i32 %160, i32 addrspace(3)* %149 + %161 = load i32, i32 addrspace(3)* %151 + %162 = bitcast i32 %161 to float + %163 = load i32, i32 addrspace(3)* %153 + %164 = bitcast i32 %163 to float + %165 = fsub float %164, %162 + %166 = bitcast float %140 to i32 + store i32 %166, i32 addrspace(3)* %149 + %167 = load i32, i32 addrspace(3)* %151 + %168 = bitcast i32 %167 to float + %169 = load i32, i32 addrspace(3)* %153 + %170 = bitcast i32 %169 to float + %171 = fsub float %170, %168 + %172 = bitcast float %141 to i32 + store i32 %172, i32 addrspace(3)* %149 + %173 = load i32, i32 addrspace(3)* %151 + %174 = bitcast i32 %173 to float + %175 = load i32, i32 addrspace(3)* %153 + %176 = bitcast i32 %175 to float + %177 = fsub float %176, %174 + %178 = insertelement <4 x float> undef, float %159, i32 0 + %179 = insertelement <4 x float> %178, float %165, i32 1 + %180 = insertelement <4 x float> %179, float %171, i32 2 + %181 = insertelement <4 x float> %180, float %177, i32 3 + %182 = extractelement <4 x float> %181, i32 0 + %183 = extractelement <4 x float> %181, i32 1 + %184 = fdiv float 1.000000e+00, %97 + %185 = fmul float %33, %184 + %186 = fcmp uge float 1.000000e+00, %185 + %187 = select i1 %186, float %185, float 1.000000e+00 + %188 = fmul float 
%187, %30 + %189 = call float @ceil(float %188) + %190 = fcmp uge float 3.000000e+00, %189 + %191 = select i1 %190, float 3.000000e+00, float %189 + %192 = fdiv float 1.000000e+00, %191 + %193 = fdiv float 1.000000e+00, %30 + %194 = fmul float %191, %193 + %195 = fmul float %31, %194 + %196 = fmul float %95, %95 + %197 = fmul float %96, %96 + %198 = fadd float %197, %196 + %199 = fmul float %97, %97 + %200 = fadd float %198, %199 + %201 = call float @llvm.AMDGPU.rsq.f32(float %200) + %202 = fmul float %95, %201 + %203 = fmul float %96, %201 + %204 = fmul float %202, %29 + %205 = fmul float %203, %29 + %206 = fmul float %204, -1.000000e+00 + %207 = fmul float %205, 1.000000e+00 + %208 = fmul float %206, %32 + %209 = fmul float %207, %32 + %210 = fsub float -0.000000e+00, %208 + %211 = fadd float %93, %210 + %212 = fsub float -0.000000e+00, %209 + %213 = fadd float %94, %212 + %214 = fmul float %206, %192 + %215 = fmul float %207, %192 + %216 = fmul float -1.000000e+00, %192 + %217 = bitcast float %136 to i32 + %218 = bitcast float %182 to i32 + %219 = bitcast float %137 to i32 + %220 = bitcast float %183 to i32 + %221 = insertelement <8 x i32> undef, i32 %217, i32 0 + %222 = insertelement <8 x i32> %221, i32 %218, i32 1 + %223 = insertelement <8 x i32> %222, i32 %219, i32 2 + %224 = insertelement <8 x i32> %223, i32 %220, i32 3 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ] + %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ] + %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ] + %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ] + %225 = fcmp oge float %temp24.0, %191 + %226 = sext i1 %225 to i32 + %227 = bitcast i32 %226 to float + %228 = bitcast float %227 to i32 + %229 = icmp ne i32 %228, 0 + br i1 %229, label %IF, label %ENDIF + +IF: ; preds = %LOOP + %230 = bitcast float %136 to i32 + %231 = bitcast float %182 to i32 + %232 = bitcast float %137 to i32 + %233 = bitcast float %183 to i32 + %234 = insertelement <8 x i32> undef, i32 %230, i32 0 + %235 = insertelement <8 x i32> %234, i32 %231, i32 1 + %236 = insertelement <8 x i32> %235, i32 %232, i32 2 + %237 = insertelement <8 x i32> %236, i32 %233, i32 3 + br label %LOOP65 + +ENDIF: ; preds = %LOOP + %238 = bitcast float %temp28.0 to i32 + %239 = bitcast float %temp29.0 to i32 + %240 = insertelement <8 x i32> %224, i32 %238, i32 4 + %241 = insertelement <8 x i32> %240, i32 %239, i32 5 + %242 = insertelement <8 x i32> %241, i32 undef, i32 6 + %243 = insertelement <8 x i32> %242, i32 undef, i32 7 + %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2) + %245 = extractelement <4 x float> %244, i32 3 + %246 = fcmp oge float %temp30.0, %245 + %247 = sext i1 %246 to i32 + %248 = bitcast i32 %247 to float + %249 = bitcast float %248 to i32 + %250 = and i32 %249, 1065353216 + %251 = bitcast i32 %250 to float + %252 = fmul float %214, %251 + %253 = fadd float %252, %temp28.0 + %254 = fmul float %215, %251 + %255 = fadd float %254, %temp29.0 + %256 = fmul float %216, %251 + %257 = fadd float %256, %temp30.0 + %258 = fadd float %temp24.0, 1.000000e+00 + br label %LOOP + +LOOP65: ; preds = %ENDIF66, %IF + %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ] + %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ] + %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ] + %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ] + %temp32.0 = phi float [ 
1.000000e+00, %IF ], [ %611, %ENDIF66 ] + %259 = fcmp oge float %temp24.1, %195 + %260 = sext i1 %259 to i32 + %261 = bitcast i32 %260 to float + %262 = bitcast float %261 to i32 + %263 = icmp ne i32 %262, 0 + br i1 %263, label %IF67, label %ENDIF66 + +IF67: ; preds = %LOOP65 + %264 = bitcast float %136 to i32 + %265 = bitcast float %182 to i32 + %266 = bitcast float %137 to i32 + %267 = bitcast float %183 to i32 + %268 = bitcast float %temp28.1 to i32 + %269 = bitcast float %temp29.1 to i32 + %270 = insertelement <8 x i32> undef, i32 %264, i32 0 + %271 = insertelement <8 x i32> %270, i32 %265, i32 1 + %272 = insertelement <8 x i32> %271, i32 %266, i32 2 + %273 = insertelement <8 x i32> %272, i32 %267, i32 3 + %274 = insertelement <8 x i32> %273, i32 %268, i32 4 + %275 = insertelement <8 x i32> %274, i32 %269, i32 5 + %276 = insertelement <8 x i32> %275, i32 undef, i32 6 + %277 = insertelement <8 x i32> %276, i32 undef, i32 7 + %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2) + %279 = extractelement <4 x float> %278, i32 0 + %280 = extractelement <4 x float> %278, i32 1 + %281 = extractelement <4 x float> %278, i32 2 + %282 = extractelement <4 x float> %278, i32 3 + %283 = fmul float %282, %47 + %284 = bitcast float %136 to i32 + %285 = bitcast float %182 to i32 + %286 = bitcast float %137 to i32 + %287 = bitcast float %183 to i32 + %288 = bitcast float %temp28.1 to i32 + %289 = bitcast float %temp29.1 to i32 + %290 = insertelement <8 x i32> undef, i32 %284, i32 0 + %291 = insertelement <8 x i32> %290, i32 %285, i32 1 + %292 = insertelement <8 x i32> %291, i32 %286, i32 2 + %293 = insertelement <8 x i32> %292, i32 %287, i32 3 + %294 = insertelement <8 x i32> %293, i32 %288, i32 4 + %295 = insertelement <8 x i32> %294, i32 %289, i32 5 + %296 = insertelement <8 x i32> %295, i32 undef, i32 6 + %297 = insertelement <8 x i32> %296, i32 undef, i32 7 + %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2) + %299 = extractelement <4 x float> %298, i32 0 + %300 = extractelement <4 x float> %298, i32 1 + %301 = extractelement <4 x float> %298, i32 2 + %302 = bitcast float %136 to i32 + %303 = bitcast float %182 to i32 + %304 = bitcast float %137 to i32 + %305 = bitcast float %183 to i32 + %306 = bitcast float %temp28.1 to i32 + %307 = bitcast float %temp29.1 to i32 + %308 = insertelement <8 x i32> undef, i32 %302, i32 0 + %309 = insertelement <8 x i32> %308, i32 %303, i32 1 + %310 = insertelement <8 x i32> %309, i32 %304, i32 2 + %311 = insertelement <8 x i32> %310, i32 %305, i32 3 + %312 = insertelement <8 x i32> %311, i32 %306, i32 4 + %313 = insertelement <8 x i32> %312, i32 %307, i32 5 + %314 = insertelement <8 x i32> %313, i32 undef, i32 6 + %315 = insertelement <8 x i32> %314, i32 undef, i32 7 + %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2) + %317 = extractelement <4 x float> %316, i32 0 + %318 = extractelement <4 x float> %316, i32 1 + %319 = extractelement <4 x float> %316, i32 2 + %320 = fmul float %317, %23 + %321 = fmul float %318, %24 + %322 = fmul float %319, %25 + %323 = fmul float %299, %26 + %324 = fadd float %323, %320 + %325 = fmul float %300, %27 + %326 = fadd float %325, %321 + %327 = fmul float %301, %28 + %328 = fadd float %327, %322 + %329 = fadd float %279, %324 + %330 = fadd float %280, %326 + %331 = fadd float %281, %328 + %332 = bitcast float %136 to i32 + %333 = bitcast float %182 to i32 + %334 = bitcast float %137 to i32 + 
%335 = bitcast float %183 to i32 + %336 = bitcast float %temp28.1 to i32 + %337 = bitcast float %temp29.1 to i32 + %338 = insertelement <8 x i32> undef, i32 %332, i32 0 + %339 = insertelement <8 x i32> %338, i32 %333, i32 1 + %340 = insertelement <8 x i32> %339, i32 %334, i32 2 + %341 = insertelement <8 x i32> %340, i32 %335, i32 3 + %342 = insertelement <8 x i32> %341, i32 %336, i32 4 + %343 = insertelement <8 x i32> %342, i32 %337, i32 5 + %344 = insertelement <8 x i32> %343, i32 undef, i32 6 + %345 = insertelement <8 x i32> %344, i32 undef, i32 7 + %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2) + %347 = extractelement <4 x float> %346, i32 0 + %348 = extractelement <4 x float> %346, i32 1 + %349 = extractelement <4 x float> %346, i32 2 + %350 = fadd float %347, -5.000000e-01 + %351 = fadd float %348, -5.000000e-01 + %352 = fadd float %349, -5.000000e-01 + %353 = fmul float %350, %350 + %354 = fmul float %351, %351 + %355 = fadd float %354, %353 + %356 = fmul float %352, %352 + %357 = fadd float %355, %356 + %358 = call float @llvm.AMDGPU.rsq.f32(float %357) + %359 = fmul float %350, %358 + %360 = fmul float %351, %358 + %361 = fmul float %352, %358 + %362 = bitcast float %136 to i32 + %363 = bitcast float %182 to i32 + %364 = bitcast float %137 to i32 + %365 = bitcast float %183 to i32 + %366 = bitcast float %temp28.1 to i32 + %367 = bitcast float %temp29.1 to i32 + %368 = insertelement <8 x i32> undef, i32 %362, i32 0 + %369 = insertelement <8 x i32> %368, i32 %363, i32 1 + %370 = insertelement <8 x i32> %369, i32 %364, i32 2 + %371 = insertelement <8 x i32> %370, i32 %365, i32 3 + %372 = insertelement <8 x i32> %371, i32 %366, i32 4 + %373 = insertelement <8 x i32> %372, i32 %367, i32 5 + %374 = insertelement <8 x i32> %373, i32 undef, i32 6 + %375 = insertelement <8 x i32> %374, i32 undef, i32 7 + %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2) + %377 = extractelement <4 x float> %376, i32 0 + %378 = extractelement <4 x float> %376, i32 1 + %379 = extractelement <4 x float> %376, i32 2 + %380 = extractelement <4 x float> %376, i32 3 + %381 = fsub float -0.000000e+00, %95 + %382 = fsub float -0.000000e+00, %96 + %383 = fsub float -0.000000e+00, %97 + %384 = fmul float %359, %381 + %385 = fmul float %360, %382 + %386 = fadd float %385, %384 + %387 = fmul float %361, %383 + %388 = fadd float %386, %387 + %389 = fmul float %388, %359 + %390 = fmul float %388, %360 + %391 = fmul float %388, %361 + %392 = fmul float 2.000000e+00, %389 + %393 = fmul float 2.000000e+00, %390 + %394 = fmul float 2.000000e+00, %391 + %395 = fsub float -0.000000e+00, %392 + %396 = fadd float %381, %395 + %397 = fsub float -0.000000e+00, %393 + %398 = fadd float %382, %397 + %399 = fsub float -0.000000e+00, %394 + %400 = fadd float %383, %399 + %401 = fmul float %396, %98 + %402 = fmul float %396, %99 + %403 = fmul float %396, %100 + %404 = fmul float %398, %101 + %405 = fadd float %404, %401 + %406 = fmul float %398, %102 + %407 = fadd float %406, %402 + %408 = fmul float %398, %103 + %409 = fadd float %408, %403 + %410 = fmul float %400, %104 + %411 = fadd float %410, %405 + %412 = fmul float %400, %105 + %413 = fadd float %412, %407 + %414 = fmul float %400, %106 + %415 = fadd float %414, %409 + %416 = bitcast float %136 to i32 + %417 = bitcast float %182 to i32 + %418 = bitcast float %137 to i32 + %419 = bitcast float %183 to i32 + %420 = bitcast float %temp28.1 to i32 + %421 = bitcast float %temp29.1 to 
i32 + %422 = insertelement <8 x i32> undef, i32 %416, i32 0 + %423 = insertelement <8 x i32> %422, i32 %417, i32 1 + %424 = insertelement <8 x i32> %423, i32 %418, i32 2 + %425 = insertelement <8 x i32> %424, i32 %419, i32 3 + %426 = insertelement <8 x i32> %425, i32 %420, i32 4 + %427 = insertelement <8 x i32> %426, i32 %421, i32 5 + %428 = insertelement <8 x i32> %427, i32 undef, i32 6 + %429 = insertelement <8 x i32> %428, i32 undef, i32 7 + %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2) + %431 = extractelement <4 x float> %430, i32 0 + %432 = extractelement <4 x float> %430, i32 1 + %433 = extractelement <4 x float> %430, i32 2 + %434 = fmul float %48, %411 + %435 = fmul float %49, %411 + %436 = fmul float %50, %411 + %437 = fmul float %51, %413 + %438 = fadd float %437, %434 + %439 = fmul float %52, %413 + %440 = fadd float %439, %435 + %441 = fmul float %53, %413 + %442 = fadd float %441, %436 + %443 = fmul float %54, %415 + %444 = fadd float %443, %438 + %445 = fmul float %55, %415 + %446 = fadd float %445, %440 + %447 = fmul float %56, %415 + %448 = fadd float %447, %442 + %449 = insertelement <4 x float> undef, float %444, i32 0 + %450 = insertelement <4 x float> %449, float %446, i32 1 + %451 = insertelement <4 x float> %450, float %448, i32 2 + %452 = insertelement <4 x float> %451, float %195, i32 3 + %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452) + %454 = extractelement <4 x float> %453, i32 0 + %455 = extractelement <4 x float> %453, i32 1 + %456 = extractelement <4 x float> %453, i32 2 + %457 = extractelement <4 x float> %453, i32 3 + %458 = call float @fabs(float %456) + %459 = fdiv float 1.000000e+00, %458 + %460 = fmul float %454, %459 + %461 = fadd float %460, 1.500000e+00 + %462 = fmul float %455, %459 + %463 = fadd float %462, 1.500000e+00 + %464 = bitcast float %463 to i32 + %465 = bitcast float %461 to i32 + %466 = bitcast float %457 to i32 + %467 = insertelement <4 x i32> undef, i32 %464, i32 0 + %468 = insertelement <4 x i32> %467, i32 %465, i32 1 + %469 = insertelement <4 x i32> %468, i32 %466, i32 2 + %470 = insertelement <4 x i32> %469, i32 undef, i32 3 + %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4) + %472 = extractelement <4 x float> %471, i32 0 + %473 = extractelement <4 x float> %471, i32 1 + %474 = extractelement <4 x float> %471, i32 2 + %475 = fmul float %431, %472 + %476 = fadd float %475, %329 + %477 = fmul float %432, %473 + %478 = fadd float %477, %330 + %479 = fmul float %433, %474 + %480 = fadd float %479, %331 + %481 = fmul float %107, %107 + %482 = fmul float %108, %108 + %483 = fadd float %482, %481 + %484 = fmul float %109, %109 + %485 = fadd float %483, %484 + %486 = call float @llvm.AMDGPU.rsq.f32(float %485) + %487 = fmul float %107, %486 + %488 = fmul float %108, %486 + %489 = fmul float %109, %486 + %490 = fmul float %377, %40 + %491 = fmul float %378, %41 + %492 = fmul float %379, %42 + %493 = fmul float %359, %487 + %494 = fmul float %360, %488 + %495 = fadd float %494, %493 + %496 = fmul float %361, %489 + %497 = fadd float %495, %496 + %498 = fmul float %497, %359 + %499 = fmul float %497, %360 + %500 = fmul float %497, %361 + %501 = fmul float 2.000000e+00, %498 + %502 = fmul float 2.000000e+00, %499 + %503 = fmul float 2.000000e+00, %500 + %504 = fsub float -0.000000e+00, %501 + %505 = fadd float %487, %504 + %506 = fsub float -0.000000e+00, %502 + %507 = fadd float %488, %506 + %508 = fsub float -0.000000e+00, %503 + 
%509 = fadd float %489, %508 + %510 = fmul float %95, %95 + %511 = fmul float %96, %96 + %512 = fadd float %511, %510 + %513 = fmul float %97, %97 + %514 = fadd float %512, %513 + %515 = call float @llvm.AMDGPU.rsq.f32(float %514) + %516 = fmul float %95, %515 + %517 = fmul float %96, %515 + %518 = fmul float %97, %515 + %519 = fmul float %505, %516 + %520 = fmul float %507, %517 + %521 = fadd float %520, %519 + %522 = fmul float %509, %518 + %523 = fadd float %521, %522 + %524 = fsub float -0.000000e+00, %523 + %525 = fcmp uge float %524, 0.000000e+00 + %526 = select i1 %525, float %524, float 0.000000e+00 + %527 = fmul float %43, %380 + %528 = fadd float %527, 1.000000e+00 + %529 = call float @llvm.pow.f32(float %526, float %528) + %530 = fmul float %476, %37 + %531 = fmul float %478, %38 + %532 = fmul float %480, %39 + %533 = fmul float %359, %487 + %534 = fmul float %360, %488 + %535 = fadd float %534, %533 + %536 = fmul float %361, %489 + %537 = fadd float %535, %536 + %538 = fcmp uge float %537, 0.000000e+00 + %539 = select i1 %538, float %537, float 0.000000e+00 + %540 = fmul float %530, %539 + %541 = fmul float %531, %539 + %542 = fmul float %532, %539 + %543 = fmul float %490, %529 + %544 = fadd float %543, %540 + %545 = fmul float %491, %529 + %546 = fadd float %545, %541 + %547 = fmul float %492, %529 + %548 = fadd float %547, %542 + %549 = fmul float %476, %34 + %550 = fmul float %478, %35 + %551 = fmul float %480, %36 + %552 = fmul float %544, %57 + %553 = fadd float %552, %549 + %554 = fmul float %546, %58 + %555 = fadd float %554, %550 + %556 = fmul float %548, %59 + %557 = fadd float %556, %551 + %558 = bitcast float %136 to i32 + %559 = bitcast float %182 to i32 + %560 = bitcast float %137 to i32 + %561 = bitcast float %183 to i32 + %562 = bitcast float %temp28.1 to i32 + %563 = bitcast float %temp29.1 to i32 + %564 = insertelement <8 x i32> undef, i32 %558, i32 0 + %565 = insertelement <8 x i32> %564, i32 %559, i32 1 + %566 = insertelement <8 x i32> %565, i32 %560, i32 2 + %567 = insertelement <8 x i32> %566, i32 %561, i32 3 + %568 = insertelement <8 x i32> %567, i32 %562, i32 4 + %569 = insertelement <8 x i32> %568, i32 %563, i32 5 + %570 = insertelement <8 x i32> %569, i32 undef, i32 6 + %571 = insertelement <8 x i32> %570, i32 undef, i32 7 + %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2) + %573 = extractelement <4 x float> %572, i32 0 + %574 = extractelement <4 x float> %572, i32 1 + %575 = extractelement <4 x float> %572, i32 2 + %576 = fmul float %573, %44 + %577 = fadd float %576, %553 + %578 = fmul float %574, %45 + %579 = fadd float %578, %555 + %580 = fmul float %575, %46 + %581 = fadd float %580, %557 + %582 = call i32 @llvm.SI.packf16(float %577, float %579) + %583 = bitcast i32 %582 to float + %584 = call i32 @llvm.SI.packf16(float %581, float %283) + %585 = bitcast i32 %584 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585) + ret void + +ENDIF66: ; preds = %LOOP65 + %586 = bitcast float %temp28.1 to i32 + %587 = bitcast float %temp29.1 to i32 + %588 = insertelement <8 x i32> %237, i32 %586, i32 4 + %589 = insertelement <8 x i32> %588, i32 %587, i32 5 + %590 = insertelement <8 x i32> %589, i32 undef, i32 6 + %591 = insertelement <8 x i32> %590, i32 undef, i32 7 + %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2) + %593 = extractelement <4 x float> %592, i32 3 + %594 = fcmp oge float 
%temp30.1, %593 + %595 = sext i1 %594 to i32 + %596 = bitcast i32 %595 to float + %597 = bitcast float %596 to i32 + %598 = and i32 %597, 1065353216 + %599 = bitcast i32 %598 to float + %600 = fmul float 5.000000e-01, %temp32.0 + %601 = fsub float -0.000000e+00, %600 + %602 = fmul float %599, %temp32.0 + %603 = fadd float %602, %601 + %604 = fmul float %214, %603 + %605 = fadd float %604, %temp28.1 + %606 = fmul float %215, %603 + %607 = fadd float %606, %temp29.1 + %608 = fmul float %216, %603 + %609 = fadd float %608, %temp30.1 + %610 = fadd float %temp24.1, 1.000000e+00 + %611 = fmul float %temp32.0, 5.000000e-01 + br label %LOOP65 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: readnone +declare i32 @llvm.SI.tid() #2 + +; Function Attrs: readonly +declare float @ceil(float) #3 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2 + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } +attributes #3 = { readonly } +attributes #4 = { nounwind readonly } + +!0 = !{!"const", null, i32 1} + +; CHECK-LABEL: {{^}}main1: +; CHECK: s_endpgm +define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0) + %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4) + %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8) + %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12) + %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28) + %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48) + %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52) + %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56) + %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64) + %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68) + %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72) + %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76) + %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) + %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) + %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) + %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148) + %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152) + %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) + %41 = call float 
@llvm.SI.load.const(<16 x i8> %22, i32 164) + %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168) + %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172) + %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) + %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) + %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) + %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) + %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) + %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) + %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) + %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) + %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) + %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220) + %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236) + %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) + %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) + %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) + %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252) + %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) + %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260) + %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264) + %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268) + %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) + %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) + %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) + %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284) + %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) + %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) + %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464) + %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468) + %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472) + %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496) + %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500) + %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504) + %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512) + %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516) + %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524) + %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532) + %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536) + %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540) + %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544) + %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548) + %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552) + %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556) + %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560) + %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564) + %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568) + %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572) + %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576) + %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580) + %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584) + %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588) + %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592) + %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596) + %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600) + %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604) + %97 = call float @llvm.SI.load.const(<16 x 
i8> %22, i32 608) + %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612) + %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616) + %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624) + %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628) + %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632) + %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636) + %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640) + %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644) + %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648) + %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652) + %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656) + %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660) + %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664) + %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668) + %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672) + %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676) + %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680) + %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684) + %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688) + %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692) + %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696) + %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700) + %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704) + %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708) + %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712) + %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716) + %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864) + %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868) + %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0 + %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0 + %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 + %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0 + %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 + %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0 + %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 + %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0 + %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 + %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0 + %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 + %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0 + %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 + %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0 + %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 + %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0 + %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 + %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0 + %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 + %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0 + %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] 
addrspace(2)* %1, i64 0, i32 5 + %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0 + %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 + %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0 + %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 + %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0 + %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 + %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0 + %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 + %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0 + %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8 + %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0 + %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8 + %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0 + %162 = fcmp ugt float %17, 0.000000e+00 + %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00 + %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) + %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) + %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6) + %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6) + %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) + %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) + %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) + %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6) + %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) + %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) + %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) + %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6) + %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) + %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) + %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) + %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6) + %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) + %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) + %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) + %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6) + %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) + %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) + %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) + %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6) + %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6) + %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6) + %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6) + %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6) + %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6) + %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6) + %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6) + %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6) + %196 = fmul float %14, %124 + 
%197 = fadd float %196, %125 + %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %202 = bitcast float %198 to i32 + %203 = icmp ne i32 %202, 0 + %. = select i1 %203, float -1.000000e+00, float 1.000000e+00 + %204 = fsub float -0.000000e+00, %164 + %205 = fadd float %44, %204 + %206 = fsub float -0.000000e+00, %165 + %207 = fadd float %45, %206 + %208 = fsub float -0.000000e+00, %166 + %209 = fadd float %46, %208 + %210 = fmul float %205, %205 + %211 = fmul float %207, %207 + %212 = fadd float %211, %210 + %213 = fmul float %209, %209 + %214 = fadd float %212, %213 + %215 = call float @llvm.AMDGPU.rsq.f32(float %214) + %216 = fmul float %205, %215 + %217 = fmul float %207, %215 + %218 = fmul float %209, %215 + %219 = fmul float %., %54 + %220 = fmul float %13, %47 + %221 = fmul float %197, %48 + %222 = bitcast float %174 to i32 + %223 = bitcast float %175 to i32 + %224 = insertelement <2 x i32> undef, i32 %222, i32 0 + %225 = insertelement <2 x i32> %224, i32 %223, i32 1 + %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2) + %227 = extractelement <4 x float> %226, i32 0 + %228 = extractelement <4 x float> %226, i32 1 + %229 = extractelement <4 x float> %226, i32 2 + %230 = extractelement <4 x float> %226, i32 3 + %231 = fmul float %227, 0x4012611180000000 + %232 = fmul float %228, 0x4012611180000000 + %233 = fmul float %229, 0x4012611180000000 + %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00) + %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00) + %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00) + %237 = fmul float %216, %184 + %238 = fmul float %217, %185 + %239 = fadd float %238, %237 + %240 = fmul float %218, %186 + %241 = fadd float %239, %240 + %242 = fmul float %216, %187 + %243 = fmul float %217, %188 + %244 = fadd float %243, %242 + %245 = fmul float %218, %189 + %246 = fadd float %244, %245 + %247 = fmul float %216, %190 + %248 = fmul float %217, %191 + %249 = fadd float %248, %247 + %250 = fmul float %218, %192 + %251 = fadd float %249, %250 + %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00) + %253 = fmul float %214, 0x3F5A36E2E0000000 + %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00) + %255 = fsub float -0.000000e+00, %254 + %256 = fadd float 1.000000e+00, %255 + %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01) + %258 = fmul float %39, %257 + %259 = fmul float %241, %258 + %260 = fmul float %246, %258 + %261 = fmul float %259, %230 + %262 = fmul float %260, %230 + %263 = fadd float %252, 0x3EE4F8B580000000 + %264 = fsub float -0.000000e+00, %252 + %265 = fadd float 1.000000e+00, %264 + %266 = fmul float 1.200000e+01, %265 + %267 = fadd float %266, 4.000000e+00 + %268 = fsub float -0.000000e+00, %267 + %269 = fmul float %268, %263 + %270 = fsub float -0.000000e+00, %267 + %271 = fmul float %270, %263 + %272 = fsub float -0.000000e+00, %267 + %273 = fmul float %272, %263 + %274 = fdiv float 1.000000e+00, %269 + %275 = fdiv float 1.000000e+00, %271 + %276 = fdiv float 1.000000e+00, %273 + %277 = fmul float %261, %274 + %278 
= fmul float %262, %275 + %279 = fmul float %263, %276 + br label %LOOP + +LOOP: ; preds = %LOOP, %main_body + %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ] + %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ] + %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ] + %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ] + %280 = bitcast float %temp168.0 to i32 + %281 = bitcast float %temp169.0 to i32 + %282 = insertelement <4 x i32> undef, i32 %280, i32 0 + %283 = insertelement <4 x i32> %282, i32 %281, i32 1 + %284 = insertelement <4 x i32> %283, i32 0, i32 2 + %285 = insertelement <4 x i32> %284, i32 undef, i32 3 + %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2) + %287 = extractelement <4 x float> %286, i32 3 + %288 = fadd float %temp168.0, %277 + %289 = fadd float %temp169.0, %278 + %290 = fadd float %temp170.0, %279 + %291 = fsub float -0.000000e+00, %287 + %292 = fadd float %290, %291 + %293 = fcmp oge float 0.000000e+00, %292 + %294 = sext i1 %293 to i32 + %295 = bitcast i32 %294 to float + %296 = bitcast float %295 to i32 + %297 = icmp ne i32 %296, 0 + br i1 %297, label %IF189, label %LOOP + +IF189: ; preds = %LOOP + %298 = extractelement <4 x float> %286, i32 0 + %299 = extractelement <4 x float> %286, i32 1 + %300 = extractelement <4 x float> %286, i32 2 + %301 = fsub float -0.000000e+00, %292 + %302 = fadd float %temp144.0, %301 + %303 = fdiv float 1.000000e+00, %302 + %304 = fmul float %292, %303 + %305 = fadd float %304, -1.000000e+00 + %306 = fmul float %305, %277 + %307 = fadd float %306, %288 + %308 = fmul float %305, %278 + %309 = fadd float %308, %289 + %310 = fsub float -0.000000e+00, %176 + %311 = fadd float %307, %310 + %312 = fsub float -0.000000e+00, %177 + %313 = fadd float %309, %312 + %314 = fadd float %176, %311 + %315 = fadd float %177, %313 + %316 = fmul float %311, %67 + %317 = fmul float %313, %68 + %318 = fmul float %316, %55 + %319 = fmul float %316, %56 + %320 = fmul float %317, %57 + %321 = fadd float %320, %318 + %322 = fmul float %317, %58 + %323 = fadd float %322, %319 + %324 = fadd float %178, %321 + %325 = fadd float %179, %323 + %326 = fmul float %316, %59 + %327 = fmul float %316, %60 + %328 = fmul float %316, %61 + %329 = fmul float %316, %62 + %330 = fmul float %317, %63 + %331 = fadd float %330, %326 + %332 = fmul float %317, %64 + %333 = fadd float %332, %327 + %334 = fmul float %317, %65 + %335 = fadd float %334, %328 + %336 = fmul float %317, %66 + %337 = fadd float %336, %329 + %338 = fadd float %168, %331 + %339 = fadd float %169, %333 + %340 = fadd float %170, %335 + %341 = fadd float %171, %337 + %342 = bitcast float %338 to i32 + %343 = bitcast float %339 to i32 + %344 = insertelement <2 x i32> undef, i32 %342, i32 0 + %345 = insertelement <2 x i32> %344, i32 %343, i32 1 + %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2) + %347 = extractelement <4 x float> %346, i32 0 + %348 = extractelement <4 x float> %346, i32 1 + %349 = extractelement <4 x float> %346, i32 2 + %350 = extractelement <4 x float> %346, i32 3 + %351 = fmul float %347, %23 + %352 = fmul float %348, %24 + %353 = fmul float %349, %25 + %354 = fmul float %350, %26 + %355 = fmul float %351, %180 + %356 = fmul float %352, %181 + %357 = fmul float %353, %182 + %358 = fmul float %354, %183 + %359 = fsub float -0.000000e+00, %350 + %360 = fadd float 1.000000e+00, %359 + %361 = fmul float %360, %49 + %362 = call float 
@llvm.AMDGPU.lrp(float %361, float %347, float %355) + %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356) + %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357) + %365 = bitcast float %340 to i32 + %366 = bitcast float %341 to i32 + %367 = insertelement <2 x i32> undef, i32 %365, i32 0 + %368 = insertelement <2 x i32> %367, i32 %366, i32 1 + %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2) + %370 = extractelement <4 x float> %369, i32 2 + %371 = fmul float %362, %234 + %372 = fmul float %363, %235 + %373 = fmul float %364, %236 + %374 = fmul float %358, %230 + %375 = bitcast float %314 to i32 + %376 = bitcast float %315 to i32 + %377 = insertelement <2 x i32> undef, i32 %375, i32 0 + %378 = insertelement <2 x i32> %377, i32 %376, i32 1 + %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2) + %380 = extractelement <4 x float> %379, i32 0 + %381 = extractelement <4 x float> %379, i32 1 + %382 = extractelement <4 x float> %379, i32 2 + %383 = extractelement <4 x float> %379, i32 3 + %384 = fcmp olt float 0.000000e+00, %382 + %385 = sext i1 %384 to i32 + %386 = bitcast i32 %385 to float + %387 = bitcast float %386 to i32 + %388 = icmp ne i32 %387, 0 + %.224 = select i1 %388, float %381, float %380 + %.225 = select i1 %388, float %383, float %381 + %389 = bitcast float %324 to i32 + %390 = bitcast float %325 to i32 + %391 = insertelement <2 x i32> undef, i32 %389, i32 0 + %392 = insertelement <2 x i32> %391, i32 %390, i32 1 + %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2) + %394 = extractelement <4 x float> %393, i32 0 + %395 = extractelement <4 x float> %393, i32 1 + %396 = extractelement <4 x float> %393, i32 2 + %397 = extractelement <4 x float> %393, i32 3 + %398 = fcmp olt float 0.000000e+00, %396 + %399 = sext i1 %398 to i32 + %400 = bitcast i32 %399 to float + %401 = bitcast float %400 to i32 + %402 = icmp ne i32 %401, 0 + %temp112.1 = select i1 %402, float %395, float %394 + %temp113.1 = select i1 %402, float %397, float %395 + %403 = fmul float %.224, 2.000000e+00 + %404 = fadd float %403, -1.000000e+00 + %405 = fmul float %.225, 2.000000e+00 + %406 = fadd float %405, -1.000000e+00 + %407 = fmul float %temp112.1, 2.000000e+00 + %408 = fadd float %407, -1.000000e+00 + %409 = fmul float %temp113.1, 2.000000e+00 + %410 = fadd float %409, -1.000000e+00 + %411 = fsub float -0.000000e+00, %404 + %412 = fmul float %411, %35 + %413 = fsub float -0.000000e+00, %406 + %414 = fmul float %413, %35 + %415 = fsub float -0.000000e+00, %408 + %416 = fmul float %415, %36 + %417 = fsub float -0.000000e+00, %410 + %418 = fmul float %417, %36 + %419 = fmul float %416, %370 + %420 = fmul float %418, %370 + %421 = call float @fabs(float %412) + %422 = call float @fabs(float %414) + %423 = fsub float -0.000000e+00, %421 + %424 = fadd float 1.000000e+00, %423 + %425 = fsub float -0.000000e+00, %422 + %426 = fadd float 1.000000e+00, %425 + %427 = fmul float %424, %419 + %428 = fadd float %427, %412 + %429 = fmul float %426, %420 + %430 = fadd float %429, %414 + %431 = fmul float %428, %428 + %432 = fmul float %430, %430 + %433 = fadd float %431, %432 + %434 = fsub float -0.000000e+00, %433 + %435 = fadd float 0x3FF00068E0000000, %434 + %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) + %437 = call float @llvm.AMDGPU.rsq.f32(float %436) + %438 = fmul float %437, %436 + %439 = fsub 
float -0.000000e+00, %436 + %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) + %441 = fmul float %184, %428 + %442 = fmul float %185, %428 + %443 = fmul float %186, %428 + %444 = fmul float %187, %430 + %445 = fadd float %444, %441 + %446 = fmul float %188, %430 + %447 = fadd float %446, %442 + %448 = fmul float %189, %430 + %449 = fadd float %448, %443 + %450 = fmul float %190, %440 + %451 = fadd float %450, %445 + %452 = fmul float %191, %440 + %453 = fadd float %452, %447 + %454 = fmul float %192, %440 + %455 = fadd float %454, %449 + %456 = fmul float %451, %451 + %457 = fmul float %453, %453 + %458 = fadd float %457, %456 + %459 = fmul float %455, %455 + %460 = fadd float %458, %459 + %461 = call float @llvm.AMDGPU.rsq.f32(float %460) + %462 = fmul float %451, %461 + %463 = fmul float %453, %461 + %464 = fmul float %455, %461 + %465 = fcmp olt float 0.000000e+00, %219 + %466 = sext i1 %465 to i32 + %467 = bitcast i32 %466 to float + %468 = bitcast float %467 to i32 + %469 = icmp ne i32 %468, 0 + br i1 %469, label %IF198, label %ENDIF197 + +IF198: ; preds = %IF189 + %470 = fsub float -0.000000e+00, %462 + %471 = fsub float -0.000000e+00, %463 + %472 = fsub float -0.000000e+00, %464 + br label %ENDIF197 + +ENDIF197: ; preds = %IF189, %IF198 + %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ] + %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ] + %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ] + %473 = bitcast float %220 to i32 + %474 = bitcast float %221 to i32 + %475 = insertelement <2 x i32> undef, i32 %473, i32 0 + %476 = insertelement <2 x i32> %475, i32 %474, i32 1 + %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2) + %478 = extractelement <4 x float> %477, i32 0 + %479 = extractelement <4 x float> %477, i32 1 + %480 = extractelement <4 x float> %477, i32 2 + %481 = extractelement <4 x float> %477, i32 3 + %482 = fmul float %478, %40 + %483 = fadd float %482, %41 + %484 = fmul float %479, %40 + %485 = fadd float %484, %41 + %486 = fmul float %480, %40 + %487 = fadd float %486, %41 + %488 = fmul float %481, %42 + %489 = fadd float %488, %43 + %490 = bitcast float %172 to i32 + %491 = bitcast float %173 to i32 + %492 = insertelement <2 x i32> undef, i32 %490, i32 0 + %493 = insertelement <2 x i32> %492, i32 %491, i32 1 + %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2) + %495 = extractelement <4 x float> %494, i32 0 + %496 = extractelement <4 x float> %494, i32 1 + %497 = extractelement <4 x float> %494, i32 2 + %498 = extractelement <4 x float> %494, i32 3 + %499 = fmul float %498, 3.200000e+01 + %500 = fadd float %499, -1.600000e+01 + %501 = call float @llvm.AMDIL.exp.(float %500) + %502 = fmul float %495, %501 + %503 = fmul float %496, %501 + %504 = fmul float %497, %501 + %505 = fmul float %28, %502 + %506 = fadd float %505, %193 + %507 = fmul float %29, %503 + %508 = fadd float %507, %194 + %509 = fmul float %30, %504 + %510 = fadd float %509, %195 + %511 = fmul float %506, %489 + %512 = fmul float %508, %489 + %513 = fmul float %510, %489 + %514 = fmul float %489, 5.000000e-01 + %515 = fadd float %514, 5.000000e-01 + %516 = fmul float %483, %515 + %517 = fadd float %516, %511 + %518 = fmul float %485, %515 + %519 = fadd float %518, %512 + %520 = fmul float %487, %515 + %521 = fadd float %520, %513 + %522 = fmul float %517, %371 + %523 = fmul float %519, %372 + %524 = fmul float %521, %373 + %525 = fmul float %428, 
0x3FDB272440000000 + %526 = fmul float %430, 0xBFDB272440000000 + %527 = fadd float %526, %525 + %528 = fmul float %440, 0x3FE99999A0000000 + %529 = fadd float %527, %528 + %530 = fmul float %529, 5.000000e-01 + %531 = fadd float %530, 0x3FE3333340000000 + %532 = fmul float %531, %531 + %533 = fmul float %522, %532 + %534 = fmul float %523, %532 + %535 = fmul float %524, %532 + %536 = fsub float -0.000000e+00, %72 + %537 = fsub float -0.000000e+00, %73 + %538 = fsub float -0.000000e+00, %74 + %539 = fmul float %temp12.0, %536 + %540 = fmul float %temp13.0, %537 + %541 = fadd float %540, %539 + %542 = fmul float %temp14.0, %538 + %543 = fadd float %541, %542 + %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00) + %545 = fmul float %371, %544 + %546 = fmul float %372, %544 + %547 = fmul float %373, %544 + %548 = fmul float %545, %69 + %549 = fmul float %546, %70 + %550 = fmul float %547, %71 + %551 = fsub float -0.000000e+00, %164 + %552 = fadd float %97, %551 + %553 = fsub float -0.000000e+00, %165 + %554 = fadd float %98, %553 + %555 = fsub float -0.000000e+00, %166 + %556 = fadd float %99, %555 + %557 = fmul float %552, %552 + %558 = fmul float %554, %554 + %559 = fadd float %558, %557 + %560 = fmul float %556, %556 + %561 = fadd float %559, %560 + %562 = call float @llvm.AMDGPU.rsq.f32(float %561) + %563 = fmul float %562, %561 + %564 = fsub float -0.000000e+00, %561 + %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) + %566 = fsub float -0.000000e+00, %84 + %567 = fadd float %565, %566 + %568 = fsub float -0.000000e+00, %83 + %569 = fadd float %565, %568 + %570 = fsub float -0.000000e+00, %82 + %571 = fadd float %565, %570 + %572 = fsub float -0.000000e+00, %84 + %573 = fadd float %83, %572 + %574 = fsub float -0.000000e+00, %83 + %575 = fadd float %82, %574 + %576 = fsub float -0.000000e+00, %82 + %577 = fadd float %81, %576 + %578 = fdiv float 1.000000e+00, %573 + %579 = fdiv float 1.000000e+00, %575 + %580 = fdiv float 1.000000e+00, %577 + %581 = fmul float %567, %578 + %582 = fmul float %569, %579 + %583 = fmul float %571, %580 + %584 = fcmp olt float %565, %83 + %585 = sext i1 %584 to i32 + %586 = bitcast i32 %585 to float + %587 = bitcast float %586 to i32 + %588 = icmp ne i32 %587, 0 + br i1 %588, label %ENDIF200, label %ELSE202 + +ELSE202: ; preds = %ENDIF197 + %589 = fcmp olt float %565, %82 + %590 = sext i1 %589 to i32 + %591 = bitcast i32 %590 to float + %592 = bitcast float %591 to i32 + %593 = icmp ne i32 %592, 0 + br i1 %593, label %ENDIF200, label %ELSE205 + +ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197 + %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ] + %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ] + %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ] + %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ] + %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ] + %594 = fcmp olt float %565, %83 + %595 = sext i1 %594 to i32 + %596 = bitcast i32 %595 to float + %597 = bitcast float %596 to i32 + %598 = icmp ne i32 %597, 0 + br i1 %598, label %ENDIF209, label %ELSE211 + +ELSE205: ; preds = %ELSE202 + %599 = fcmp olt float %565, %81 + %600 = sext i1 %599 to i32 + %601 = bitcast i32 %600 to float + %602 = bitcast float %601 to i32 + %603 = icmp ne i32 %602, 0 + %.226 = select i1 %603, float %583, float 1.000000e+00 + %.227 = select i1 %603, float 
%118, float %116 + %.228 = select i1 %603, float %119, float %117 + br label %ENDIF200 + +ELSE211: ; preds = %ENDIF200 + %604 = fcmp olt float %565, %82 + %605 = sext i1 %604 to i32 + %606 = bitcast i32 %605 to float + %607 = bitcast float %606 to i32 + %608 = icmp ne i32 %607, 0 + br i1 %608, label %ENDIF209, label %ELSE214 + +ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200 + %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ] + %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ] + %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ] + %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ] + %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ] + %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ] + %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ] + %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ] + %609 = fmul float %164, %85 + %610 = fmul float %165, %86 + %611 = fadd float %609, %610 + %612 = fmul float %166, %87 + %613 = fadd float %611, %612 + %614 = fmul float %167, %88 + %615 = fadd float %613, %614 + %616 = fmul float %164, %89 + %617 = fmul float %165, %90 + %618 = fadd float %616, %617 + %619 = fmul float %166, %91 + %620 = fadd float %618, %619 + %621 = fmul float %167, %92 + %622 = fadd float %620, %621 + %623 = fmul float %164, %93 + %624 = fmul float %165, %94 + %625 = fadd float %623, %624 + %626 = fmul float %166, %95 + %627 = fadd float %625, %626 + %628 = fmul float %167, %96 + %629 = fadd float %627, %628 + %630 = fsub float -0.000000e+00, %78 + %631 = fadd float 1.000000e+00, %630 + %632 = call float @fabs(float %615) + %633 = call float @fabs(float %622) + %634 = fcmp oge float %631, %632 + %635 = sext i1 %634 to i32 + %636 = bitcast i32 %635 to float + %637 = bitcast float %636 to i32 + %638 = and i32 %637, 1065353216 + %639 = bitcast i32 %638 to float + %640 = fcmp oge float %631, %633 + %641 = sext i1 %640 to i32 + %642 = bitcast i32 %641 to float + %643 = bitcast float %642 to i32 + %644 = and i32 %643, 1065353216 + %645 = bitcast i32 %644 to float + %646 = fmul float %639, %645 + %647 = fmul float %629, %646 + %648 = fmul float %615, %temp68.0 + %649 = fadd float %648, %temp70.0 + %650 = fmul float %622, %temp69.0 + %651 = fadd float %650, %temp71.0 + %652 = fmul float %615, %temp52.0 + %653 = fadd float %652, %temp54.0 + %654 = fmul float %622, %temp53.0 + %655 = fadd float %654, %temp55.0 + %656 = fadd float %temp80.0, -1.000000e+00 + %657 = fmul float %656, %77 + %658 = fadd float %657, 1.000000e+00 + %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00) + %660 = bitcast float %649 to i32 + %661 = bitcast float %651 to i32 + %662 = bitcast float 0.000000e+00 to i32 + %663 = insertelement <4 x i32> undef, i32 %660, i32 0 + %664 = insertelement <4 x i32> %663, i32 %661, i32 1 + %665 = insertelement <4 x i32> %664, i32 %662, i32 2 + %666 = insertelement <4 x i32> %665, i32 undef, i32 3 + %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2) + %668 = extractelement <4 x float> %667, i32 0 + %669 = extractelement <4 x float> %667, i32 1 + %670 = bitcast float %653 to i32 + %671 = bitcast float %655 to i32 + %672 = bitcast float 0.000000e+00 to i32 + %673 = insertelement <4 x i32> undef, i32 %670, i32 0 + %674 = insertelement <4 x i32> %673, 
i32 %671, i32 1 + %675 = insertelement <4 x i32> %674, i32 %672, i32 2 + %676 = insertelement <4 x i32> %675, i32 undef, i32 3 + %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2) + %678 = extractelement <4 x float> %677, i32 0 + %679 = extractelement <4 x float> %677, i32 1 + %680 = fsub float -0.000000e+00, %669 + %681 = fadd float 1.000000e+00, %680 + %682 = fsub float -0.000000e+00, %679 + %683 = fadd float 1.000000e+00, %682 + %684 = fmul float %681, 2.500000e-01 + %685 = fmul float %683, 2.500000e-01 + %686 = fsub float -0.000000e+00, %684 + %687 = fadd float %668, %686 + %688 = fsub float -0.000000e+00, %685 + %689 = fadd float %678, %688 + %690 = fmul float %647, %temp88.0 + %691 = fadd float %690, %temp89.0 + %692 = fmul float %647, %temp90.0 + %693 = fadd float %692, %temp91.0 + %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00) + %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00) + %696 = fsub float -0.000000e+00, %694 + %697 = fadd float %668, %696 + %698 = fsub float -0.000000e+00, %695 + %699 = fadd float %678, %698 + %700 = fmul float %668, %668 + %701 = fmul float %678, %678 + %702 = fsub float -0.000000e+00, %700 + %703 = fadd float %687, %702 + %704 = fsub float -0.000000e+00, %701 + %705 = fadd float %689, %704 + %706 = fcmp uge float %703, %75 + %707 = select i1 %706, float %703, float %75 + %708 = fcmp uge float %705, %75 + %709 = select i1 %708, float %705, float %75 + %710 = fmul float %697, %697 + %711 = fadd float %710, %707 + %712 = fmul float %699, %699 + %713 = fadd float %712, %709 + %714 = fdiv float 1.000000e+00, %711 + %715 = fdiv float 1.000000e+00, %713 + %716 = fmul float %707, %714 + %717 = fmul float %709, %715 + %718 = fcmp oge float %697, 0.000000e+00 + %719 = sext i1 %718 to i32 + %720 = bitcast i32 %719 to float + %721 = bitcast float %720 to i32 + %722 = icmp ne i32 %721, 0 + %.229 = select i1 %722, float 1.000000e+00, float %716 + %723 = fcmp oge float %699, 0.000000e+00 + %724 = sext i1 %723 to i32 + %725 = bitcast i32 %724 to float + %726 = bitcast float %725 to i32 + %727 = icmp ne i32 %726, 0 + %temp28.0 = select i1 %727, float 1.000000e+00, float %717 + %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229) + %729 = call float @llvm.pow.f32(float %728, float %76) + %730 = fmul float %729, %79 + %731 = fadd float %730, %80 + %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00) + %733 = fmul float %732, %732 + %734 = fmul float 2.000000e+00, %732 + %735 = fsub float -0.000000e+00, %734 + %736 = fadd float 3.000000e+00, %735 + %737 = fmul float %733, %736 + %738 = fmul float %548, %737 + %739 = fmul float %549, %737 + %740 = fmul float %550, %737 + %741 = fmul float %738, %515 + %742 = fadd float %741, %533 + %743 = fmul float %739, %515 + %744 = fadd float %743, %534 + %745 = fmul float %740, %515 + %746 = fadd float %745, %535 + %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00) + %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00) + %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00) + %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00) + %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00) + %752 = fmul float %748, %751 + %753 = fmul float %749, %751 + %754 = fmul float %750, %751 + %755 = fmul float %742, %752 + %756 = fmul float 
%744, %753 + %757 = fmul float %746, %754 + %758 = fmul float %temp12.0, %216 + %759 = fmul float %temp13.0, %217 + %760 = fadd float %759, %758 + %761 = fmul float %temp14.0, %218 + %762 = fadd float %760, %761 + %763 = call float @fabs(float %762) + %764 = fmul float %763, %763 + %765 = fmul float %764, %50 + %766 = fadd float %765, %51 + %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00) + %768 = fsub float -0.000000e+00, %767 + %769 = fadd float 1.000000e+00, %768 + %770 = fmul float %33, %769 + %771 = fmul float %33, %769 + %772 = fmul float %33, %769 + %773 = fmul float %34, %769 + %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755) + %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756) + %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757) + %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374) + %778 = fcmp uge float %774, 0x3E6FFFFE60000000 + %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000 + %780 = fcmp uge float %775, 0x3E6FFFFE60000000 + %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000 + %782 = fcmp uge float %776, 0x3E6FFFFE60000000 + %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000 + %784 = fcmp uge float %779, 6.550400e+04 + %785 = select i1 %784, float 6.550400e+04, float %779 + %786 = fcmp uge float %781, 6.550400e+04 + %787 = select i1 %786, float 6.550400e+04, float %781 + %788 = fcmp uge float %783, 6.550400e+04 + %789 = select i1 %788, float 6.550400e+04, float %783 + %790 = fmul float %777, %52 + %791 = fadd float %790, %53 + %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00) + %793 = call i32 @llvm.SI.packf16(float %785, float %787) + %794 = bitcast i32 %793 to float + %795 = call i32 @llvm.SI.packf16(float %789, float %792) + %796 = bitcast i32 %795 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796) + ret void + +ELSE214: ; preds = %ELSE211 + %797 = fcmp olt float %565, %81 + %798 = sext i1 %797 to i32 + %799 = bitcast i32 %798 to float + %800 = bitcast float %799 to i32 + %801 = icmp ne i32 %800, 0 + %.230 = select i1 %801, float %104, float %100 + %.231 = select i1 %801, float %105, float %101 + %.232 = select i1 %801, float %106, float %102 + %.233 = select i1 %801, float %107, float %103 + br label %ENDIF209 +} + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.cndlt(float, float, float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #2 + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } +attributes #3 = { nounwind readonly } +attributes #4 = { readonly } diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll new file mode 100644 index 00000000000..4b2d8ec6bf0 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -0,0 +1,501 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | 
FileCheck -check-prefix=SI %s + +; If this occurs it is likely due to reordering and the restore was +; originally supposed to happen before SI_END_CF. +; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] +; SI-NOT: v_readlane_b32 [[SAVED]] + +define void @main() #0 { +main_body: + %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) + %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) + %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) + %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) + %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) + %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) + %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) + %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) + %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) + %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) + %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) + %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) + %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) + %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) + %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) + %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) + %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) + %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) + %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) + %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) + %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) + %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) + %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) + %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) + %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) + %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) + %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) + %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) + %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) + %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) + %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) + %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) + %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) + %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) + %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) + %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) + %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) + %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) + %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) + %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) + %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) + %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) + %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) + %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) + %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) + %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) + %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) + %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) + %50 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 336) + %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) + %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) + %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) + %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) + %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) + %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) + %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) + %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) + %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) + %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) + %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) + %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) + %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) + %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) + %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) + %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + br label %LOOP + +LOOP: ; preds = %ENDIF2795, %main_body + %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] + %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] + %67 = icmp sgt i32 undef, 4 + br i1 %67, label %ENDLOOP, label %ENDIF + +ENDLOOP: ; preds = %ELSE2566, %LOOP + %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00) + ret void + +ENDIF: ; preds = %LOOP + %69 = fsub float %2, undef + %70 = fsub float %3, undef + %71 = fsub float %4, undef + %72 = fmul float %69, 0.000000e+00 + %73 = fmul float %70, undef + %74 = fmul float %71, undef + %75 = fsub float %6, undef + %76 = fsub float %7, undef + %77 = fmul float %75, undef + %78 = fmul float %76, 0.000000e+00 + %79 = call float @llvm.minnum.f32(float %74, float %78) + %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00) + %81 = call float @llvm.maxnum.f32(float %73, float %77) + %82 = call float @llvm.maxnum.f32(float undef, float %79) + %83 = call float @llvm.minnum.f32(float %80, float %81) + %84 = call float @llvm.minnum.f32(float %83, float undef) + %85 = fsub float %14, undef + %86 = fsub float %15, undef + %87 = fsub float %16, undef + %88 = fmul float %85, undef + %89 = fmul float %86, undef + %90 = fmul float %87, undef + %91 = fsub float %17, undef + %92 = fsub float %18, undef + %93 = fsub float %19, undef + %94 = fmul float %91, 0.000000e+00 + %95 = fmul float %92, undef + %96 = fmul float %93, undef + %97 = call float @llvm.minnum.f32(float %89, float %95) + %98 = call float @llvm.maxnum.f32(float %88, float %94) + %99 = call float @llvm.maxnum.f32(float %90, float %96) + %100 = call float @llvm.maxnum.f32(float undef, float %97) + %101 = call float @llvm.maxnum.f32(float %100, float undef) + %102 = call float @llvm.minnum.f32(float %98, float undef) + %103 = call float @llvm.minnum.f32(float %102, float %99) + %104 = fsub float %30, undef + %105 = fsub float %31, undef + %106 = fmul float %104, 0.000000e+00 + %107 = fmul float %105, 0.000000e+00 + %108 = call float @llvm.minnum.f32(float undef, float %106) + %109 = call float @llvm.maxnum.f32(float undef, float %107) + %110 = call float @llvm.maxnum.f32(float undef, float %108) + %111 = call float @llvm.maxnum.f32(float %110, float undef) + %112 = call float @llvm.minnum.f32(float undef, float %109) + %113 = 
fsub float %32, undef + %114 = fsub float %33, undef + %115 = fsub float %34, undef + %116 = fmul float %113, 0.000000e+00 + %117 = fmul float %114, undef + %118 = fmul float %115, undef + %119 = fsub float %35, undef + %120 = fsub float %36, undef + %121 = fsub float %37, undef + %122 = fmul float %119, undef + %123 = fmul float %120, undef + %124 = fmul float %121, undef + %125 = call float @llvm.minnum.f32(float %116, float %122) + %126 = call float @llvm.minnum.f32(float %117, float %123) + %127 = call float @llvm.minnum.f32(float %118, float %124) + %128 = call float @llvm.maxnum.f32(float %125, float %126) + %129 = call float @llvm.maxnum.f32(float %128, float %127) + %130 = fsub float %38, undef + %131 = fsub float %39, undef + %132 = fsub float %40, undef + %133 = fmul float %130, 0.000000e+00 + %134 = fmul float %131, undef + %135 = fmul float %132, undef + %136 = fsub float %41, undef + %137 = fsub float %42, undef + %138 = fsub float %43, undef + %139 = fmul float %136, undef + %140 = fmul float %137, undef + %141 = fmul float %138, undef + %142 = call float @llvm.minnum.f32(float %133, float %139) + %143 = call float @llvm.minnum.f32(float %134, float %140) + %144 = call float @llvm.minnum.f32(float %135, float %141) + %145 = call float @llvm.maxnum.f32(float %142, float %143) + %146 = call float @llvm.maxnum.f32(float %145, float %144) + %147 = fsub float %44, undef + %148 = fsub float %45, undef + %149 = fsub float %46, undef + %150 = fmul float %147, 0.000000e+00 + %151 = fmul float %148, 0.000000e+00 + %152 = fmul float %149, undef + %153 = fsub float %47, undef + %154 = fsub float %48, undef + %155 = fsub float %49, undef + %156 = fmul float %153, undef + %157 = fmul float %154, 0.000000e+00 + %158 = fmul float %155, undef + %159 = call float @llvm.minnum.f32(float %150, float %156) + %160 = call float @llvm.minnum.f32(float %151, float %157) + %161 = call float @llvm.minnum.f32(float %152, float %158) + %162 = call float @llvm.maxnum.f32(float %159, float %160) + %163 = call float @llvm.maxnum.f32(float %162, float %161) + %164 = fsub float %50, undef + %165 = fsub float %51, undef + %166 = fsub float %52, undef + %167 = fmul float %164, undef + %168 = fmul float %165, 0.000000e+00 + %169 = fmul float %166, 0.000000e+00 + %170 = fsub float %53, undef + %171 = fsub float %54, undef + %172 = fsub float %55, undef + %173 = fdiv float 1.000000e+00, %temp18.0 + %174 = fmul float %170, undef + %175 = fmul float %171, undef + %176 = fmul float %172, %173 + %177 = call float @llvm.minnum.f32(float %167, float %174) + %178 = call float @llvm.minnum.f32(float %168, float %175) + %179 = call float @llvm.minnum.f32(float %169, float %176) + %180 = call float @llvm.maxnum.f32(float %177, float %178) + %181 = call float @llvm.maxnum.f32(float %180, float %179) + %182 = fsub float %62, undef + %183 = fsub float %63, undef + %184 = fsub float %64, undef + %185 = fmul float %182, 0.000000e+00 + %186 = fmul float %183, undef + %187 = fmul float %184, undef + %188 = fsub float %65, undef + %189 = fsub float %66, undef + %190 = fmul float %188, undef + %191 = fmul float %189, undef + %192 = call float @llvm.maxnum.f32(float %185, float %190) + %193 = call float @llvm.maxnum.f32(float %186, float %191) + %194 = call float @llvm.maxnum.f32(float %187, float undef) + %195 = call float @llvm.minnum.f32(float %192, float %193) + %196 = call float @llvm.minnum.f32(float %195, float %194) + %.temp292.7 = select i1 undef, float %163, float undef + %temp292.9 = select i1 false, float %181, float 
%.temp292.7 + %.temp292.9 = select i1 undef, float undef, float %temp292.9 + %197 = fcmp ogt float undef, 0.000000e+00 + %198 = fcmp olt float undef, %196 + %199 = and i1 %197, %198 + %200 = fcmp olt float undef, %.temp292.9 + %201 = and i1 %199, %200 + %temp292.11 = select i1 %201, float undef, float %.temp292.9 + br i1 undef, label %IF2565, label %ELSE2566 + +IF2565: ; preds = %ENDIF + br i1 false, label %ENDIF2582, label %ELSE2584 + +ELSE2566: ; preds = %ENDIF + %202 = fcmp oeq float %temp292.11, 1.000000e+04 + br i1 %202, label %ENDLOOP, label %ELSE2593 + +ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 + %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] + %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ] + %203 = fsub float %5, undef + %204 = fmul float %203, undef + %205 = call float @llvm.maxnum.f32(float undef, float %204) + %206 = call float @llvm.minnum.f32(float %205, float undef) + %207 = call float @llvm.minnum.f32(float %206, float undef) + %208 = fcmp ogt float undef, 0.000000e+00 + %209 = fcmp olt float undef, 1.000000e+00 + %210 = and i1 %208, %209 + %211 = fcmp olt float undef, %207 + %212 = and i1 %210, %211 + br i1 %212, label %ENDIF2795, label %ELSE2797 + +ELSE2584: ; preds = %IF2565 + br label %ENDIF2582 + +ENDIF2582: ; preds = %ELSE2584, %IF2565 + %213 = fadd float %1, undef + %214 = fadd float 0.000000e+00, %213 + %215 = call float @llvm.AMDIL.fraction.(float %214) + br i1 undef, label %IF2589, label %ELSE2590 + +IF2589: ; preds = %ENDIF2582 + br label %ENDIF2588 + +ELSE2590: ; preds = %ENDIF2582 + br label %ENDIF2588 + +ENDIF2588: ; preds = %ELSE2590, %IF2589 + %216 = fsub float 1.000000e+00, %215 + %217 = call float @llvm.sqrt.f32(float %216) + %218 = fmul float %217, undef + %219 = fadd float %218, undef + br label %ENDIF2564 + +ELSE2593: ; preds = %ELSE2566 + %220 = fcmp oeq float %temp292.11, %82 + %221 = fcmp olt float %82, %84 + %222 = and i1 %220, %221 + br i1 %222, label %ENDIF2594, label %ELSE2596 + +ELSE2596: ; preds = %ELSE2593 + %223 = fcmp oeq float %temp292.11, %101 + %224 = fcmp olt float %101, %103 + %225 = and i1 %223, %224 + br i1 %225, label %ENDIF2594, label %ELSE2632 + +ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 + %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] + %226 = fmul float %temp894.2, undef + br label %ENDIF2564 + +ELSE2632: ; preds = %ELSE2596 + br i1 undef, label %ENDIF2594, label %ELSE2650 + +ELSE2650: ; preds = %ELSE2632 + %227 = fcmp oeq float %temp292.11, %111 + %228 = fcmp olt float %111, %112 + %229 = and i1 %227, %228 + br i1 %229, label %IF2667, label %ELSE2668 + +IF2667: ; preds = %ELSE2650 + br i1 undef, label %ENDIF2594, label %ELSE2671 + +ELSE2668: ; preds = %ELSE2650 + %230 = fcmp oeq float %temp292.11, %129 + %231 = fcmp olt float %129, undef + %232 = and i1 %230, %231 + br i1 %232, label %ENDIF2594, label %ELSE2686 + +ELSE2671: ; preds = %IF2667 + br label %ENDIF2594 + +ELSE2686: ; preds = %ELSE2668 + %233 = fcmp oeq float 
%temp292.11, %146 + %234 = fcmp olt float %146, undef + %235 = and i1 %233, %234 + br i1 %235, label %ENDIF2594, label %ELSE2704 + +ELSE2704: ; preds = %ELSE2686 + %236 = fcmp oeq float %temp292.11, %181 + %237 = fcmp olt float %181, undef + %238 = and i1 %236, %237 + br i1 %238, label %ENDIF2594, label %ELSE2740 + +ELSE2740: ; preds = %ELSE2704 + br i1 undef, label %IF2757, label %ELSE2758 + +IF2757: ; preds = %ELSE2740 + br i1 undef, label %ENDIF2594, label %ELSE2761 + +ELSE2758: ; preds = %ELSE2740 + br i1 undef, label %IF2775, label %ENDIF2594 + +ELSE2761: ; preds = %IF2757 + br label %ENDIF2594 + +IF2775: ; preds = %ELSE2758 + %239 = fcmp olt float undef, undef + br i1 %239, label %ENDIF2594, label %ELSE2779 + +ELSE2779: ; preds = %IF2775 + br i1 undef, label %ENDIF2594, label %ELSE2782 + +ELSE2782: ; preds = %ELSE2779 + br i1 undef, label %ENDIF2594, label %ELSE2785 + +ELSE2785: ; preds = %ELSE2782 + %240 = fcmp olt float undef, 0.000000e+00 + br i1 %240, label %ENDIF2594, label %ELSE2788 + +ELSE2788: ; preds = %ELSE2785 + %241 = fcmp olt float 0.000000e+00, undef + %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00 + br label %ENDIF2594 + +ELSE2797: ; preds = %ENDIF2564 + %242 = fsub float %8, undef + %243 = fsub float %9, undef + %244 = fsub float %10, undef + %245 = fmul float %242, undef + %246 = fmul float %243, undef + %247 = fmul float %244, undef + %248 = fsub float %11, undef + %249 = fsub float %12, undef + %250 = fsub float %13, undef + %251 = fmul float %248, undef + %252 = fmul float %249, undef + %253 = fmul float %250, undef + %254 = call float @llvm.minnum.f32(float %245, float %251) + %255 = call float @llvm.minnum.f32(float %246, float %252) + %256 = call float @llvm.maxnum.f32(float %247, float %253) + %257 = call float @llvm.maxnum.f32(float %254, float %255) + %258 = call float @llvm.maxnum.f32(float %257, float undef) + %259 = call float @llvm.minnum.f32(float undef, float %256) + %260 = fcmp ogt float %258, 0.000000e+00 + %261 = fcmp olt float %258, 1.000000e+00 + %262 = and i1 %260, %261 + %263 = fcmp olt float %258, %259 + %264 = and i1 %262, %263 + br i1 %264, label %ENDIF2795, label %ELSE2800 + +ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 + br label %LOOP + +ELSE2800: ; preds = %ELSE2797 + br i1 undef, label %ENDIF2795, label %ELSE2803 + +ELSE2803: ; preds = %ELSE2800 + %265 = fsub float %20, undef + %266 = fsub float %21, undef + %267 = fsub float %22, undef + %268 = fmul float %265, undef + %269 = fmul float %266, undef + %270 = fmul float %267, 0.000000e+00 + %271 = fsub float %23, undef + %272 = fsub float %24, undef + %273 = fsub float %25, undef + %274 = fmul float %271, undef + %275 = fmul float %272, undef + %276 = fmul float %273, undef + %277 = call float @llvm.minnum.f32(float %268, float %274) + %278 = call float @llvm.maxnum.f32(float %269, float %275) + %279 = call float @llvm.maxnum.f32(float %270, float %276) + %280 = call float @llvm.maxnum.f32(float %277, float undef) + %281 = call float @llvm.maxnum.f32(float %280, float undef) + %282 = call float @llvm.minnum.f32(float undef, float %278) + %283 = call float @llvm.minnum.f32(float %282, float %279) + %284 = fcmp ogt float %281, 0.000000e+00 + %285 = fcmp olt float %281, 1.000000e+00 + %286 = and i1 %284, %285 + %287 = fcmp olt float %281, %283 + %288 = and i1 %286, %287 + br i1 %288, label %ENDIF2795, label %ELSE2806 + +ELSE2806: ; preds = %ELSE2803 + %289 = fsub float %26, 
undef + %290 = fsub float %27, undef + %291 = fsub float %28, undef + %292 = fmul float %289, undef + %293 = fmul float %290, 0.000000e+00 + %294 = fmul float %291, undef + %295 = fsub float %29, undef + %296 = fmul float %295, undef + %297 = call float @llvm.minnum.f32(float %292, float %296) + %298 = call float @llvm.minnum.f32(float %293, float undef) + %299 = call float @llvm.maxnum.f32(float %294, float undef) + %300 = call float @llvm.maxnum.f32(float %297, float %298) + %301 = call float @llvm.maxnum.f32(float %300, float undef) + %302 = call float @llvm.minnum.f32(float undef, float %299) + %303 = fcmp ogt float %301, 0.000000e+00 + %304 = fcmp olt float %301, 1.000000e+00 + %305 = and i1 %303, %304 + %306 = fcmp olt float %301, %302 + %307 = and i1 %305, %306 + br i1 %307, label %ENDIF2795, label %ELSE2809 + +ELSE2809: ; preds = %ELSE2806 + br i1 undef, label %ENDIF2795, label %ELSE2812 + +ELSE2812: ; preds = %ELSE2809 + br i1 undef, label %ENDIF2795, label %ELSE2815 + +ELSE2815: ; preds = %ELSE2812 + br i1 undef, label %ENDIF2795, label %ELSE2818 + +ELSE2818: ; preds = %ELSE2815 + br i1 undef, label %ENDIF2795, label %ELSE2821 + +ELSE2821: ; preds = %ELSE2818 + %308 = fsub float %56, undef + %309 = fsub float %57, undef + %310 = fsub float %58, undef + %311 = fmul float %308, undef + %312 = fmul float %309, 0.000000e+00 + %313 = fmul float %310, undef + %314 = fsub float %59, undef + %315 = fsub float %60, undef + %316 = fsub float %61, undef + %317 = fmul float %314, undef + %318 = fmul float %315, undef + %319 = fmul float %316, undef + %320 = call float @llvm.maxnum.f32(float %311, float %317) + %321 = call float @llvm.maxnum.f32(float %312, float %318) + %322 = call float @llvm.maxnum.f32(float %313, float %319) + %323 = call float @llvm.minnum.f32(float %320, float %321) + %324 = call float @llvm.minnum.f32(float %323, float %322) + %325 = fcmp ogt float undef, 0.000000e+00 + %326 = fcmp olt float undef, 1.000000e+00 + %327 = and i1 %325, %326 + %328 = fcmp olt float undef, %324 + %329 = and i1 %327, %328 + br i1 %329, label %ENDIF2795, label %ELSE2824 + +ELSE2824: ; preds = %ELSE2821 + %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 + br label %ENDIF2795 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.fraction.(float) #2 + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.minnum.f32(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll new file mode 100644 index 00000000000..5a6129aaa3f --- /dev/null +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -0,0 +1,236 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x 
i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.AMDGPU.barrier.local() #2 + + +@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 +@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8 +@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 + +; FUNC-LABEL: @reorder_local_load_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: buffer_store_dword +define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI: buffer_store_dword +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store volatile i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: buffer_store_dword +define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + call void @llvm.AMDGPU.barrier.local() #2 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; Technically we could reorder these, but just comparing the +; instruction type of the load is insufficient. 
+ +; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load +; CI: buffer_load_dword +; CI: buffer_store_dword +; CI: buffer_load_dword +; CI: buffer_store_dword +define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_constant_load_local_store_constant_load +; CI: buffer_load_dword +; CI: buffer_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load +; CI: s_load_dword +; CI: s_load_dword +; CI: s_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_global_load_local_store_global_load +; CI: buffer_load_dword +; CI: buffer_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_local_offsets +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: buffer_store_dword +; CI: s_endpgm +define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 + 
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 + %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101 + + store i32 123, i32 addrspace(3)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4 + store i32 123, i32 addrspace(3)* %ptr2, align 4 + %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 789, i32 addrspace(3)* %ptr3, align 4 + + %add.0 = add nsw i32 %tmp2, %tmp1 + %add.1 = add nsw i32 %add.0, %tmp3 + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_global_offsets +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword +; CI: s_endpgm +define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 + %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101 + + store i32 123, i32 addrspace(1)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4 + store i32 123, i32 addrspace(1)* %ptr2, align 4 + %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4 + store i32 789, i32 addrspace(1)* %ptr3, align 4 + + %add.0 = add nsw i32 %tmp2, %tmp1 + %add.1 = add nsw i32 %add.0, %tmp3 + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load +; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 +; XCI: TBUFFER_STORE_FORMAT +; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 +; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 { +; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + +; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 +; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + +; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + +; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 +; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, +; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, +; i32 1, i32 0) + +; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + +; %add = add nsw i32 %tmp1, %tmp2 + +; store i32 %add, i32 addrspace(1)* %out, align 4 +; ret void +; } + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #2 = { nounwind noduplicate } diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll new file mode 100644 
index 00000000000..bd427dd3ed4 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-vector-hang.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}test_8_min_char: +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; ModuleID = 'radeon' + +define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { +entry: + %0 = load i8, i8 addrspace(1)* %in0, align 1 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1 + %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1 + %3 = insertelement <8 x i8> %1, i8 %2, i32 1 + %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2 + %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1 + %5 = insertelement <8 x i8> %3, i8 %4, i32 2 + %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3 + %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1 + %7 = insertelement <8 x i8> %5, i8 %6, i32 3 + %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4 + %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1 + %9 = insertelement <8 x i8> undef, i8 %8, i32 0 + %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5 + %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1 + %11 = insertelement <8 x i8> %9, i8 %10, i32 1 + %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6 + %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1 + %13 = insertelement <8 x i8> %11, i8 %12, i32 2 + %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7 + %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1 + %15 = insertelement <8 x i8> %13, i8 %14, i32 3 + %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> + %16 = load i8, i8 addrspace(1)* %in1, align 1 + %17 = insertelement <8 x i8> undef, i8 %16, i32 0 + %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1 + %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1 + %19 = insertelement <8 x i8> %17, i8 %18, i32 1 + %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2 + %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1 + %21 = insertelement <8 x i8> %19, i8 %20, i32 2 + %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3 + %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1 + %23 = insertelement <8 x i8> %21, i8 %22, i32 3 + %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4 + %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1 + %25 = insertelement <8 x i8> undef, i8 %24, i32 0 + %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5 + %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1 + %27 = insertelement <8 x i8> %25, i8 %26, i32 1 + %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6 + %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1 + %29 = insertelement <8 x i8> %27, i8 %28, i32 2 + %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7 + %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1 + %31 = insertelement <8 x i8> %29, i8 
%30, i32 3 + %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> + %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11 + %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11 + %32 = extractelement <8 x i8> %cond.i, i32 0 + store i8 %32, i8 addrspace(1)* %out, align 1 + %33 = extractelement <8 x i8> %cond.i, i32 1 + %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 + store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1 + %34 = extractelement <8 x i8> %cond.i, i32 2 + %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2 + store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1 + %35 = extractelement <8 x i8> %cond.i, i32 3 + %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3 + store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1 + %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4 + %36 = extractelement <8 x i8> %cond.i, i32 4 + store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1 + %37 = extractelement <8 x i8> %cond.i, i32 5 + %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5 + store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1 + %38 = extractelement <8 x i8> %cond.i, i32 6 + %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6 + store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1 + %39 = extractelement <8 x i8> %cond.i, i32 7 + %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7 + store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} + +!0 = !{null} +!1 = !{null} +!2 = !{null} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char} +!4 = !{null} +!5 = !{null} +!6 = !{null} +!7 = !{null} +!8 = !{null} diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll new file mode 100644 index 00000000000..06bee114c23 --- /dev/null +++ b/test/CodeGen/AMDGPU/sign_extend.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_sext_i1_to_i32: +; SI: v_cndmask_b32_e64 +; SI: s_endpgm +define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_s_sext_i32_to_i64: +; SI: s_ashr_i32 +; SI: s_endpg +define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +entry: + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %sext = sext i32 %add to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i1_to_i64: +; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc +; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] +; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} +; SI: s_endpgm +define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} 
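+; The checks above rely on the i1 result being materialized as 0 or -1 in the
+; low dword with v_cndmask and then copied into the high dword, which is what a
+; full 64-bit sign extension of an i1 looks like.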
+ +; SI-LABEL: {{^}}s_sext_i32_to_i64: +; SI: s_ashr_i32 +; SI: s_endpgm +define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { + %sext = sext i32 %a to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}v_sext_i32_to_i64: +; SI: v_ashr +; SI: s_endpgm +define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %sext = sext i32 %val to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i16_to_i64: +; SI: s_endpgm +define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { + %sext = sext i16 %a to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll b/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll new file mode 100644 index 00000000000..dffee70b6b0 --- /dev/null +++ b/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll @@ -0,0 +1,39 @@ +; XFAIL: * +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s + +; 64-bit select was originally lowered with a build_pair, and this +; could be simplified to 1 cndmask instead of 2, but that broken when +; it started being implemented with a v2i32 build_vector and +; bitcasting. +define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %select to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Fix truncating store for local memory +; SI-LABEL: {{^}}trunc_load_alloca_i64: +; SI: v_movrels_b32 +; SI-NOT: v_movrels_b32 +; SI: s_endpgm +define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { + %idx = add i32 %a, %b + %alloca = alloca i64, i32 4 + %gep0 = getelementptr i64, i64* %alloca, i64 0 + %gep1 = getelementptr i64, i64* %alloca, i64 1 + %gep2 = getelementptr i64, i64* %alloca, i64 2 + %gep3 = getelementptr i64, i64* %alloca, i64 3 + store i64 24, i64* %gep0, align 8 + store i64 9334, i64* %gep1, align 8 + store i64 3935, i64* %gep2, align 8 + store i64 9342, i64* %gep3, align 8 + %gep = getelementptr i64, i64* %alloca, i32 %idx + %load = load i64, i64* %gep, align 8 + %mask = and i64 %load, 4294967296 + %add = add i64 %mask, -1 + store i64 %add, i64 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll new file mode 100644 index 00000000000..da4e91db3a3 --- /dev/null +++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 +; SI: v_cvt_f64_i32_e32 +define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { + %result = sitofp i32 %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; FIXME: select on 0, 0 +; SI-LABEL: {{^}}sint_to_fp_i1_f64: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already +; uses an SGPR for [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] 
+; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = sitofp i1 %cmp to double + store double %fp, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}sint_to_fp_i1_f64_load: +; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1 +; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { + %fp = sitofp i1 %in to double + store double %fp, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @s_sint_to_fp_i64_to_f64 +define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { + %result = sitofp i64 %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; SI-LABEL: @v_sint_to_fp_i64_to_f64 +; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 +; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %gep, align 8 + %result = sitofp i64 %val to double + store double %result, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll new file mode 100644 index 00000000000..8506441d136 --- /dev/null +++ b/test/CodeGen/AMDGPU/sint_to_fp.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32: +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} +define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { + %result = sitofp i32 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_v2i32: +; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = sitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_v4i32: +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %value = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = sitofp <4 x i32> %value to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + 
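+; For an i1 source no conversion instruction is involved; the checks below
+; expect the result to be produced by selecting between 0 and 1.0 with
+; v_cndmask_b32_e64.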
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) { + %fp = sitofp i1 %in to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll new file mode 100644 index 00000000000..b0c18ca5959 --- /dev/null +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -0,0 +1,111 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +; SMRD load with an immediate offset. +; GCN-LABEL: {{^}}smrd0: +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 +define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset. +; GCN-LABEL: {{^}}smrd1: +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate. 
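+; On SI the SMRD immediate offset counts dwords and is limited to 8 bits, so
+; index 255 above still encodes as 0xff, while index 256 below no longer fits
+; and has to be materialized in an SGPR first. VI uses a byte offset with a
+; wider field, which is why the same indices show up as 0x3fc and 0x400 in the
+; VI checks.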
+; GCN-LABEL: {{^}}smrd2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; GCN: s_endpgm +define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with a 64-bit offset +; GCN-LABEL: {{^}}smrd3: +; FIXME: There are too many copies here because we don't fold immediates +; through REG_SEQUENCE +; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; +; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 +; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; FIXME: We should be able to use s_load_dword here +; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 +; TODO: Add VI checks +; GCN: s_endpgm +define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load using the load.const intrinsic with an immediate offset +; GCN-LABEL: {{^}}smrd_load_const0: +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 +define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; SMRD load using the load.const intrinsic with the largest possible immediate +; offset. +; GCN-LABEL: {{^}}smrd_load_const1: +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} +; SMRD load using the load.const intrinsic with an offset greater than the +; largets possible immediate. +; immediate offset. 
+; GCN-LABEL: {{^}}smrd_load_const2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll new file mode 100644 index 00000000000..46409cdfae1 --- /dev/null +++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -0,0 +1,48 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() readnone + +; This is broken because the low half of the 64-bit add remains on the +; SALU, but the upper half does not. The addc expects the carry bit +; set in vcc, which is undefined since the low scalar half add sets +; scc instead. + +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { + %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, 399 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; Doesn't use constants +; FUNC-LABEL @imp_def_vcc_split_i64_add_2 +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll new file mode 100644 index 00000000000..bcbc32f4c05 --- /dev/null +++ b/test/CodeGen/AMDGPU/sra.ll @@ -0,0 +1,213 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: 
llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s + +;EG-LABEL: {{^}}ashr_v2i32: +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_v2i32: +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v2i32: +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = ashr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v4i32: +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_v4i32: +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v4i32: +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = ashr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_i64: +;EG: ASHR + +;SI-LABEL: {{^}}ashr_i64: +;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 + +;VI-LABEL: {{^}}ashr_i64: +;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 + +define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) { +entry: + %0 = sext i32 %in to i64 + %1 = ashr i64 %0, 8 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_i64_2: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal +;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_i64_2: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_i64_2: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +entry: + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1) * %in + %b = load i64, i64 addrspace(1) * %b_ptr + %result = ashr i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v2i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: ASHR {{.*}}, [[SHA]] +;EG-DAG: ASHR {{.*}}, [[SHB]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI-LABEL: {{^}}ashr_v2i64: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v2i64: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1) * %in + %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr + %result = ashr <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v4i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHC]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHD]] +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: ASHR {{.*}}, [[SHA]] +;EG-DAG: ASHR {{.*}}, [[SHB]] +;EG-DAG: ASHR {{.*}}, [[SHC]] +;EG-DAG: ASHR {{.*}}, [[SHD]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI-LABEL: {{^}}ashr_v4i64: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v4i64: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1) * %in + %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr + %result = ashr <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll new file mode 100644 index 00000000000..c78fd549b31 --- /dev/null +++ b/test/CodeGen/AMDGPU/srem.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s + +define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in + %den = load i32, i32 addrspace(1) * %den_ptr + %result = srem i32 %num, %den + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = srem i32 %num, 4 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srem_i32_7: +; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 +; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], +; SI: v_mul_lo_i32 +; SI: v_sub_i32 +; SI: s_endpgm +define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = srem i32 %num, 7 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %num = load <2 x i32>, <2 x i32> addrspace(1) * %in + %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr + %result = srem <2 x i32> %num, %den + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %num = load <2 x i32>, <2 x i32> addrspace(1) * %in + %result = srem <2 x i32> %num, + store <2 x i32> %result, <2 x i32> 
addrspace(1)* %out + ret void +} + +define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %num = load <4 x i32>, <4 x i32> addrspace(1) * %in + %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr + %result = srem <4 x i32> %num, %den + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %num = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = srem <4 x i32> %num, + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %num = load i64, i64 addrspace(1) * %in + %den = load i64, i64 addrspace(1) * %den_ptr + %result = srem i64 %num, %den + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %num = load i64, i64 addrspace(1) * %in + %result = srem i64 %num, 4 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %num = load <2 x i64>, <2 x i64> addrspace(1) * %in + %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr + %result = srem <2 x i64> %num, %den + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %num = load <2 x i64>, <2 x i64> addrspace(1) * %in + %result = srem <2 x i64> %num, + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %num = load <4 x i64>, <4 x i64> addrspace(1) * %in + %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr + %result = srem <4 x i64> %num, %den + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %num = load <4 x i64>, <4 x i64> addrspace(1) * %in + %result = srem <4 x i64> %num, + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll new file mode 100644 index 00000000000..4904d7fa1bd --- /dev/null +++ b/test/CodeGen/AMDGPU/srl.ll @@ -0,0 +1,186 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}lshr_i32: +; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = lshr i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v2i32: +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = lshr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v4i32: +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = lshr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 +define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1)* %in + %b = load i64, i64 addrspace(1)* %b_ptr + %result = lshr i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v2i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1)* %in + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = lshr <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v4i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHC]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHD]] +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHC]] +; EG-DAG: LSHR {{.*}}, [[SHD]] +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHC]] +; EG-DAG: LSHR {{.*}}, [[SHD]] +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1)* %in + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = lshr <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll new file mode 100644 index 00000000000..26884a1b776 --- /dev/null +++ b/test/CodeGen/AMDGPU/ssubo.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}ssubo_i64_zext: +define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_ssubo_i32: +define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_ssubo_i32: +define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_ssubo_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 +define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + 
store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_ssubo_i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll new file mode 100644 index 00000000000..4a72b4d090a --- /dev/null +++ b/test/CodeGen/AMDGPU/store-barrier.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s + +; This test is for a bug in the machine scheduler where stores without +; an underlying object would be moved across the barrier. In this +; test, the <2 x i8> store will be split into two i8 stores, so they +; won't have an underlying object. + +; CHECK-LABEL: {{^}}test: +; CHECK: ds_write_b8 +; CHECK: ds_write_b8 +; CHECK: s_barrier +; CHECK: s_endpgm +; Function Attrs: nounwind +define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) { +bb: + %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 + %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 + %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 + %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 + %tmp16 = add i32 %tmp13, 1 + %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 + store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 + tail call void @llvm.AMDGPU.barrier.local() #2 + %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 + %tmp26 = sext i32 %tmp25 to i64 + %tmp27 = sext i32 %arg4 to i64 + %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4 + %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1 + %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27 + store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1 + %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0 + %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1 + %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0 + store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1 + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/store-v3i32.ll b/test/CodeGen/AMDGPU/store-v3i32.ll new file mode 100644 index 00000000000..33617b55ed6 --- /dev/null +++ b/test/CodeGen/AMDGPU/store-v3i32.ll @@ -0,0 +1,13 @@ +; XFAIL: * +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck 
-check-prefix=SI %s + +; 3 vectors have the same size and alignment as 4 vectors, so this +; should be done in a single store. + +; SI-LABEL: {{^}}store_v3i32: +; SI: buffer_store_dwordx4 +define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { + store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll new file mode 100644 index 00000000000..e0c554ad2c1 --- /dev/null +++ b/test/CodeGen/AMDGPU/store-v3i64.ll @@ -0,0 +1,29 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}global_store_v3i64: +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}global_store_v3i64_unaligned: +define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}local_store_v3i64: +define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}local_store_v3i64_unaligned: +define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/test/CodeGen/AMDGPU/store-vector-ptrs.ll new file mode 100644 index 00000000000..d5af3b29118 --- /dev/null +++ b/test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s + +; This tests for a bug that caused a crash in +; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting +; scratch loads and stores. 
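+; The vector GEP below produces a <4 x i32*> result from a vector of array
+; pointers, which is the kind of address pattern that triggered the crash
+; described above.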
+; CHECK-LABEL: {{^}}store_vector_ptrs: +define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { + %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> + store <4 x i32*> %p, <4 x i32*>* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll new file mode 100644 index 00000000000..0f89405e073 --- /dev/null +++ b/test/CodeGen/AMDGPU/store.ll @@ -0,0 +1,369 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s + +;===------------------------------------------------------------------------===; +; Global Address Space +;===------------------------------------------------------------------------===; +; FUNC-LABEL: {{^}}store_i1: +; EG: MEM_RAT MSKOR +; SI: buffer_store_byte +define void @store_i1(i1 addrspace(1)* %out) { +entry: + store i1 true, i1 addrspace(1)* %out + ret void +} + +; i8 store +; FUNC-LABEL: {{^}}store_i8: +; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y +; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) + + +; IG 1: Truncate the calculated the shift amount for the mask + +; IG 2: Shift the value and the mask +; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] +; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-NEXT: 255 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the prevous IGs. +; EG: MOV T[[RW_GPR]].Y, 0.0 +; EG: MOV * T[[RW_GPR]].Z, 0.0 + +; SI: buffer_store_byte + +define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +entry: + store i8 %in, i8 addrspace(1)* %out + ret void +} + +; i16 store +; FUNC-LABEL: {{^}}store_i16: +; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X + +; IG 0: Get the byte index and truncate the value + + +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 3(4.203895e-45), + +; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y + +; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) +; IG 1: Truncate the calculated the shift amount for the mask + +; IG 2: Shift the value and the mask +; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] +; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-NEXT: 65535 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the prevous IGs. 
+; EG: MOV T[[RW_GPR]].Y, 0.0 +; EG: MOV * T[[RW_GPR]].Z, 0.0 + +; SI: buffer_store_short +define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +entry: + store i16 %in, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8: +; EG: MEM_RAT MSKOR +; EG-NOT: MEM_RAT MSKOR + +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}store_v2i16: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_short +; SI: buffer_store_short +define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(1)* %out + ret void +} + +; floating-point store +; FUNC-LABEL: {{^}}store_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 + +; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}} + +; SI: buffer_store_dword + +define void @store_f32(float addrspace(1)* %out, float %in) { + store float %in, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i16: +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG-NOT: MEM_RAT MSKOR + +; SI: buffer_store_short +; SI: buffer_store_short +; SI: buffer_store_short +; SI: buffer_store_short +; SI-NOT: buffer_store_byte +define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i16> + store <4 x i16> %0, <4 x i16> addrspace(1)* %out + ret void +} + +; vec2 floating-point stores +; FUNC-LABEL: {{^}}store_v2f32: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx2 + +define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { +entry: + %0 = insertelement <2 x float> , float %a, i32 0 + %1 = insertelement <2 x float> %0, float %b, i32 1 + store <2 x float> %1, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32: +; EG: MEM_RAT_CACHELESS STORE_RAW +; EG-NOT: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD +; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx4 +define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i8: +; EG: MEM_RAT MSKOR +; SI: buffer_store_byte +define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i16: +; EG: MEM_RAT MSKOR +; SI: buffer_store_short +define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; Local Address Space +;===------------------------------------------------------------------------===; + +; FUNC-LABEL: 
{{^}}store_local_i1: +; EG: LDS_BYTE_WRITE +; SI: ds_write_b8 +define void @store_local_i1(i1 addrspace(3)* %out) { +entry: + store i1 true, i1 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i8: +; EG: LDS_BYTE_WRITE + +; SI: ds_write_b8 +define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { + store i8 %in, i8 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i16: +; EG: LDS_SHORT_WRITE + +; SI: ds_write_b16 +define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { + store i16 %in, i16 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v2i16: +; EG: LDS_WRITE + +; CM: LDS_WRITE + +; SI: ds_write_b16 +; SI: ds_write_b16 +define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { +entry: + store <2 x i16> %in, <2 x i16> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v4i8: +; EG: LDS_WRITE + +; CM: LDS_WRITE + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +entry: + store <4 x i8> %in, <4 x i8> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v2i32: +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write_b64 +define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { +entry: + store <2 x i32> %in, <2 x i32> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v4i32: +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i64_i8: +; EG: LDS_BYTE_WRITE +; SI: ds_write_b8 +define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i64_i16: +; EG: LDS_SHORT_WRITE +; SI: ds_write_b16 +define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(3)* %out + ret void +} + +; The stores in this function are combined by the optimizer to create a +; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer +; should not try to split the 64-bit store back into 2 32-bit stores. +; +; Evergreen / Northern Islands don't support 64-bit stores yet, so there should +; be two 32-bit stores. 
+ +; FUNC-LABEL: {{^}}vecload2: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx2 +define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +entry: + %0 = load i32, i32 addrspace(2)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 + %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + store i32 %0, i32 addrspace(1)* %out, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; When i128 was a legal type this program generated cannot select errors: + +; FUNC-LABEL: {{^}}"i128-const-store": +; FIXME: We should be able to to this with one store instruction +; EG: STORE_RAW +; EG: STORE_RAW +; EG: STORE_RAW +; EG: STORE_RAW +; CM: STORE_DWORD +; CM: STORE_DWORD +; CM: STORE_DWORD +; CM: STORE_DWORD +; SI: buffer_store_dwordx4 +define void @i128-const-store(i32 addrspace(1)* %out) { +entry: + store i32 1, i32 addrspace(1)* %out, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 1, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 2, i32 addrspace(1)* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 2, i32 addrspace(1)* %arrayidx6, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/store.r600.ll b/test/CodeGen/AMDGPU/store.r600.ll new file mode 100644 index 00000000000..696fb033b5e --- /dev/null +++ b/test/CodeGen/AMDGPU/store.r600.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s + +; XXX: Merge this test into store.ll once it is supported on SI + +; v4i32 store +; EG: {{^}}store_v4i32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 + +define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; v4f32 store +; EG: {{^}}store_v4f32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 +define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %1 = load <4 x float>, <4 x float> addrspace(1) * %in + store <4 x float> %1, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll new file mode 100644 index 00000000000..02e592e9a55 --- /dev/null +++ b/test/CodeGen/AMDGPU/structurize.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s +; Test case for a crash in the AMDILCFGStructurizer from a CFG like this: +; +; entry +; / \ +; diamond_head branch_from +; / \ | +; diamond_false diamond_true +; \ / +; done +; +; When the diamond_true branch had more than 100 instructions. +; +; + +; CHECK-LABEL: {{^}}branch_into_diamond: +; === entry block: +; CHECK: ALU_PUSH_BEFORE +; === Branch instruction (IF): +; CHECK: JUMP + ; === branch_from block + ; CHECK: ALU + ; === Duplicated diamond_true block (There can be more than one ALU clause): + ; === XXX: We should be able to optimize this so the basic block is not + ; === duplicated. 
See comments in + ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf() + ; CHECK: ALU +; === Branch instruction (ELSE): +; CHECK: ELSE + ; === diamond_head block: + ; CHECK: ALU_PUSH_BEFORE + ; === Branch instruction (IF): + ; CHECK: JUMP + ; === diamond_true block (There can be more than one ALU clause): + ; ALU + ; === Branch instruction (ELSE): + ; CHECK: ELSE + ; === diamond_false block plus implicit ENDIF + ; CHECK: ALU_POP_AFTER +; === Branch instruction (ENDIF): +; CHECK: POP +; === done block: +; CHECK: ALU +; CHECK: MEM_RAT_CACHELESS +; CHECK: CF_END + + +define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: +%0 = icmp ne i32 %a, 0 + br i1 %0, label %diamond_head, label %branch_from + +diamond_head: + %1 = icmp ne i32 %a, 1 + br i1 %1, label %diamond_true, label %diamond_false + +branch_from: + %2 = add i32 %a, 1 + br label %diamond_true + +diamond_false: + %3 = add i32 %a, 2 + br label %done + +diamond_true: + %4 = phi i32 [%2, %branch_from], [%a, %diamond_head] + ; This block needs to be > 100 ISA instructions to hit the bug, + ; so we'll use udiv instructions. + %div0 = udiv i32 %a, %b + %div1 = udiv i32 %div0, %4 + %div2 = udiv i32 %div1, 11 + %div3 = udiv i32 %div2, %a + %div4 = udiv i32 %div3, %b + %div5 = udiv i32 %div4, %c + %div6 = udiv i32 %div5, %div0 + %div7 = udiv i32 %div6, %div1 + br label %done + +done: + %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll new file mode 100644 index 00000000000..77432c1f9d2 --- /dev/null +++ b/test/CodeGen/AMDGPU/structurize1.ll @@ -0,0 +1,62 @@ +; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s + +; This tests for abug where the AMDILCFGStructurizer was crashing on loops +; like this: +; +; for (i = 0; i < x; i++) { +; if (cond0) { +; if (cond1) { +; +; } else { +; +; } +; if (cond2) { +; +; } +; } +; } + +; CHECK-LABEL: {{^}}if_inside_loop: +; CHECK: LOOP_START_DX10 +; CHECK: END_LOOP +define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%inc, %for.inc] + %val = phi i32 [0, %entry], [%val.for.inc, %for.inc] + %inc = add i32 %0, 1 + %1 = icmp ult i32 10, %a + br i1 %1, label %for.inc, label %if.then + +if.then: + %2 = icmp ne i32 0, %b + br i1 %2, label %if.then.true, label %if.then.false + +if.then.true: + %3 = add i32 %a, %val + br label %if + +if.then.false: + %4 = mul i32 %a, %val + br label %if + +if: + %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false] + %5 = icmp ne i32 0, %c + br i1 %5, label %if.true, label %for.inc + +if.true: + %6 = add i32 %a, %val.if + br label %for.inc + +for.inc: + %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true] + %7 = icmp ne i32 0, %d + br i1 %7, label %for.body, label %exit + +exit: + store i32 %val.for.inc, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll new file mode 100644 index 00000000000..b7fba0efa5b --- /dev/null +++ b/test/CodeGen/AMDGPU/sub.ll @@ -0,0 +1,130 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare i32 @llvm.r600.read.tidig.x() readnone + +; FUNC-LABEL: {{^}}test_sub_i32: +; EG: SUB_INT {{\** 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = sub i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_sub_v2i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = sub <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_sub_v4i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = sub <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_sub_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sub_i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid + %a = load i64, i64 addrspace(1)* %a_ptr + %b = load i64, i64 addrspace(1)* %b_ptr + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_test_sub_v2i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> 
addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid + %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = sub <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_test_sub_v4i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid + %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = sub <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll new file mode 100644 index 00000000000..c4dae4736cf --- /dev/null +++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -0,0 +1,109 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s + +; SI-LABEL:{{^}}row_filter_C1_D0: +; SI: s_endpgm +; Function Attrs: nounwind +define void @row_filter_C1_D0() { +entry: + br i1 undef, label %for.inc.1, label %do.body.preheader + +do.body.preheader: ; preds = %entry + %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 + br i1 undef, label %do.body56.1, label %do.body90 + +do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader + %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] + %2 = insertelement <4 x i32> %1, i32 undef, i32 2 + %3 = insertelement <4 x i32> %2, i32 undef, i32 3 + br i1 undef, label %do.body124.1, label %do.body.1562.preheader + +do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 + %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] + %4 = insertelement <4 x i32> undef, i32 undef, i32 1 + br label %for.inc.1 + +do.body56.1: ; preds = %do.body.preheader + %5 = insertelement <4 x i32> %0, i32 undef, i32 1 + %or.cond472.1 = or i1 undef, undef + br i1 %or.cond472.1, label %do.body56.2, label %do.body90 + +do.body56.2: ; preds = %do.body56.1 + %6 = insertelement <4 x i32> %5, i32 undef, i32 1 + br label %do.body90 + +do.body124.1: ; preds = %do.body90 + %7 = insertelement <4 x i32> %3, i32 undef, i32 3 + br label %do.body.1562.preheader + +for.inc.1: ; preds = %do.body.1562.preheader, %entry + %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ] + %add.i495 = add <4 x i32> %storemerge591, undef + unreachable +} + +; SI-LABEL: {{^}}foo: +; SI: s_endpgm +define void @foo() #0 { +bb: + br i1 undef, label %bb2, label %bb1 + +bb1: ; preds = %bb + br i1 undef, label %bb4, label %bb6 + +bb2: ; preds = %bb4, %bb + %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ] + br i1 undef, label %bb9, label %bb13 + +bb4: ; preds = %bb7, %bb6, %bb1 + %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ] + br label %bb2 + +bb6: ; 
preds = %bb1 + br i1 undef, label %bb7, label %bb4 + +bb7: ; preds = %bb6 + %tmp8 = fmul float undef, undef + br label %bb4 + +bb9: ; preds = %bb2 + %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) + %tmp11 = extractelement <4 x float> %tmp10, i32 1 + %tmp12 = extractelement <4 x float> %tmp10, i32 3 + br label %bb14 + +bb13: ; preds = %bb2 + br i1 undef, label %bb23, label %bb24 + +bb14: ; preds = %bb27, %bb24, %bb9 + %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] + %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] + %tmp17 = fmul float 10.5, %tmp16 + %tmp18 = fmul float 11.5, %tmp15 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) + ret void + +bb23: ; preds = %bb13 + br i1 undef, label %bb24, label %bb26 + +bb24: ; preds = %bb26, %bb23, %bb13 + %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ] + br i1 undef, label %bb27, label %bb14 + +bb26: ; preds = %bb23 + br label %bb24 + +bb27: ; preds = %bb24 + br label %bb14 +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll new file mode 100644 index 00000000000..8bd995a8ecb --- /dev/null +++ b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s +; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges +; properly. + +; Just make sure this test doesn't crash. 
+; CHECK-LABEL: foobar: +; CHECK: s_endpgm +define void @foobar() { + %v0 = icmp eq <4 x i32> undef, + %v3 = sext <4 x i1> %v0 to <4 x i32> + %v4 = extractelement <4 x i32> %v3, i32 1 + %v5 = icmp ne i32 %v4, 0 + %v6 = select i1 %v5, i32 undef, i32 0 + %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 + store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 + ret void +} + +declare double @llvm.fma.f64(double, double, double) diff --git a/test/CodeGen/AMDGPU/swizzle-export.ll b/test/CodeGen/AMDGPU/swizzle-export.ll new file mode 100644 index 00000000000..000ee2faa47 --- /dev/null +++ b/test/CodeGen/AMDGPU/swizzle-export.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s + +;EG: {{^}}main: +;EG: EXPORT T{{[0-9]+}}.XYXX +;EG: EXPORT T{{[0-9]+}}.ZXXX +;EG: EXPORT T{{[0-9]+}}.XXWX +;EG: EXPORT T{{[0-9]+}}.XXXW + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = load <4 x float>, <4 x float> addrspace(8)* null + %5 = extractelement <4 x float> %4, i32 1 + %6 = load <4 x float>, <4 x float> addrspace(8)* null + %7 = extractelement <4 x float> %6, i32 2 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 0 + %10 = fmul float 0.000000e+00, %9 + %11 = load <4 x float>, <4 x float> addrspace(8)* null + %12 = extractelement <4 x float> %11, i32 0 + %13 = fmul float %5, %12 + %14 = load <4 x float>, <4 x float> addrspace(8)* null + %15 = extractelement <4 x float> %14, i32 0 + %16 = fmul float 0.000000e+00, %15 + %17 = load <4 x float>, <4 x float> addrspace(8)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = fmul float 0.000000e+00, %18 + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fmul float %7, %21 + %23 = load <4 x float>, <4 x float> addrspace(8)* null + %24 = extractelement <4 x float> %23, i32 0 + %25 = fmul float 0.000000e+00, %24 + %26 = load <4 x float>, <4 x float> addrspace(8)* null + %27 = extractelement <4 x float> %26, i32 0 + %28 = fmul float 0.000000e+00, %27 + %29 = load <4 x float>, <4 x float> addrspace(8)* null + %30 = extractelement <4 x float> %29, i32 0 + %31 = fmul float 0.000000e+00, %30 + %32 = load <4 x float>, <4 x float> addrspace(8)* null + %33 = extractelement <4 x float> %32, i32 0 + %34 = fmul float 0.000000e+00, %33 + %35 = load <4 x float>, <4 x float> addrspace(8)* null + %36 = extractelement <4 x float> %35, i32 0 + %37 = fmul float 0.000000e+00, %36 + %38 = load <4 x float>, <4 x float> addrspace(8)* null + %39 = extractelement <4 x float> %38, i32 0 + %40 = fmul float 1.000000e+00, %39 + %41 = load <4 x float>, <4 x float> addrspace(8)* null + %42 = extractelement <4 x float> %41, i32 0 + %43 = fmul float 0.000000e+00, %42 + %44 = load <4 x float>, <4 x float> addrspace(8)* null + %45 = extractelement <4 x float> %44, i32 0 + %46 = fmul float 0.000000e+00, %45 + %47 = load <4 x float>, <4 x float> addrspace(8)* null + %48 = extractelement <4 x float> %47, i32 0 + %49 = fmul float 0.000000e+00, %48 + %50 = load <4 x float>, <4 x float> addrspace(8)* null + %51 = extractelement <4 x float> %50, i32 0 + %52 = fmul float 0.000000e+00, %51 + %53 = load <4 x float>, <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = fmul float 
1.000000e+00, %54 + %56 = insertelement <4 x float> undef, float %0, i32 0 + %57 = insertelement <4 x float> %56, float %1, i32 1 + %58 = insertelement <4 x float> %57, float %2, i32 2 + %59 = insertelement <4 x float> %58, float %3, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1) + %60 = insertelement <4 x float> undef, float %10, i32 0 + %61 = insertelement <4 x float> %60, float %13, i32 1 + %62 = insertelement <4 x float> %61, float %16, i32 2 + %63 = insertelement <4 x float> %62, float %19, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2) + %64 = insertelement <4 x float> undef, float %22, i32 0 + %65 = insertelement <4 x float> %64, float %25, i32 1 + %66 = insertelement <4 x float> %65, float %28, i32 2 + %67 = insertelement <4 x float> %66, float %31, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2) + %68 = insertelement <4 x float> undef, float %34, i32 0 + %69 = insertelement <4 x float> %68, float %37, i32 1 + %70 = insertelement <4 x float> %69, float %40, i32 2 + %71 = insertelement <4 x float> %70, float %43, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2) + %72 = insertelement <4 x float> undef, float %46, i32 0 + %73 = insertelement <4 x float> %72, float %49, i32 1 + %74 = insertelement <4 x float> %73, float %52, i32 2 + %75 = insertelement <4 x float> %74, float %55, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2) + ret void +} + +; EG: {{^}}main2: +; EG: T{{[0-9]+}}.XY__ +; EG: T{{[0-9]+}}.ZXY0 + +define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = fadd float %0, 2.5 + %3 = fmul float %1, 3.5 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %5 = extractelement <4 x float> %4, i32 0 + %6 = call float @llvm.cos.f32(float %5) + %7 = load <4 x float>, <4 x float> addrspace(8)* null + %8 = extractelement <4 x float> %7, i32 0 + %9 = load <4 x float>, <4 x float> addrspace(8)* null + %10 = extractelement <4 x float> %9, i32 1 + %11 = insertelement <4 x float> undef, float %2, i32 0 + %12 = insertelement <4 x float> %11, float %3, i32 1 + call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1) + %13 = insertelement <4 x float> undef, float %6, i32 0 + %14 = insertelement <4 x float> %13, float %8, i32 1 + %15 = insertelement <4 x float> %14, float %10, i32 2 + %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2) + ret void +} + +; Function Attrs: nounwind readonly +declare float @llvm.cos.f32(float) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/tex-clause-antidep.ll b/test/CodeGen/AMDGPU/tex-clause-antidep.ll new file mode 100644 index 00000000000..cbb9c50974a --- /dev/null +++ b/test/CodeGen/AMDGPU/tex-clause-antidep.ll @@ -0,0 +1,25 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX +;CHECK-NEXT: ALU + +define void @test(<4 x float> inreg %reg0) #0 { + %1 = extractelement <4 x float> %reg0, i32 0 + %2 = extractelement <4 x float> %reg0, i32 1 + %3 = extractelement <4 x float> %reg0, i32 2 + %4 = extractelement <4 x float> %reg0, i32 3 + %5 = insertelement <4 x float> undef, float 
%1, i32 0 + %6 = insertelement <4 x float> %5, float %2, i32 1 + %7 = insertelement <4 x float> %6, float %3, i32 2 + %8 = insertelement <4 x float> %7, float %4, i32 3 + %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %11 = fadd <4 x float> %9, %10 + call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0) + ret void +} + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/texture-input-merge.ll b/test/CodeGen/AMDGPU/texture-input-merge.ll new file mode 100644 index 00000000000..789538af582 --- /dev/null +++ b/test/CodeGen/AMDGPU/texture-input-merge.ll @@ -0,0 +1,31 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: MOV + +define void @test(<4 x float> inreg %reg0) #0 { + %1 = extractelement <4 x float> %reg0, i32 0 + %2 = extractelement <4 x float> %reg0, i32 1 + %3 = extractelement <4 x float> %reg0, i32 2 + %4 = extractelement <4 x float> %reg0, i32 3 + %5 = fmul float %1, 3.0 + %6 = fmul float %2, 3.0 + %7 = fmul float %3, 3.0 + %8 = fmul float %4, 3.0 + %9 = insertelement <4 x float> undef, float %5, i32 0 + %10 = insertelement <4 x float> %9, float %6, i32 1 + %11 = insertelement <4 x float> undef, float %7, i32 0 + %12 = insertelement <4 x float> %11, float %5, i32 1 + %13 = insertelement <4 x float> undef, float %8, i32 0 + %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %17 = fadd <4 x float> %14, %15 + %18 = fadd <4 x float> %17, %16 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0) + ret void +} + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll new file mode 100644 index 00000000000..dac74728b3c --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -0,0 +1,170 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}} +; SI: v_cndmask_b32_e64 +; SI: buffer_store_byte +define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FIXME: The negate should be inverting the compare. 
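+; That is, instead of computing (x == 1) and then flipping the mask with
+; s_xor_b64, a single inverted compare (e.g. v_cmp_ne_i32_e32 vcc, 1, [[TMP]])
+; could feed the v_cndmask_b32 directly.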
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, 1 
+ store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FIXME: This should be one compare. +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1: +; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} +; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] +; XSI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1: +; SI: buffer_load_sbyte [[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}} +; SI-NEXT: v_cndmask_b32_e64 +; SI-NEXT: buffer_store_byte +define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8, i8 addrspace(1)* %in + %masked = and i8 %load, 255 + %ext = sext i8 %masked to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll new file mode 100644 index 00000000000..c29872beef8 --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -0,0 +1,56 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI < %s + +; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: +; GCN: s_endpgm +define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { + %val = load double, double addrspace(1)* %in + %cvt = fptrunc double %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: +; GCN: s_endpgm +define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { + %val = load <2 x double>, <2 x double> addrspace(1)* %in + %cvt = fptrunc <2 x double> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: +; GCN: s_endpgm +define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { + %val = load <3 x double>, <3 x double> addrspace(1)* %in + %cvt = fptrunc <3 x double> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}global_truncstore_v4f64_to_v4f16: +; GCN: s_endpgm +define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { + %val = load <4 x double>, <4 x double> addrspace(1)* %in + %cvt = fptrunc <4 x double> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: +; GCN: s_endpgm +define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { + %val = load <8 x double>, <8 x double> addrspace(1)* %in + %cvt = fptrunc <8 x double> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: +; GCN: s_endpgm +define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { + %val = load <16 x double>, <16 x double> addrspace(1)* %in + %cvt = fptrunc <16 x double> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll new file mode 100644 index 00000000000..b71a838b62c --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + + +; SI-LABEL: {{^}}global_truncstore_i32_to_i1: +; SI: s_load_dword [[LOAD:s[0-9]+]], +; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; SI: buffer_store_byte [[VREG]], +define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { + %trunc = trunc i32 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}global_truncstore_i64_to_i1: +; SI: buffer_store_byte +define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { + %trunc = trunc i64 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}global_truncstore_i16_to_i1: +; SI: s_load_dword [[LOAD:s[0-9]+]], +; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; SI: buffer_store_byte [[VREG]], +define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { + %trunc = trunc i16 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll new file mode 100644 index 00000000000..878ea3f4899 --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This tests for a bug in the SelectionDAG where custom lowered truncated +; vector stores at the end of a basic block were not being added to the +; LegalizedNodes list, which triggered an assertion failure. 
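+; The <4 x i8> store in the %if block below is the case in question: it needs
+; custom lowering, and only the branch follows it in its basic block.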
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: MEM_RAT_CACHELESS STORE_RAW
+define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+entry:
+  %0 = icmp eq i32 %cond, 0
+  br i1 %0, label %if, label %done
+
+if:
+  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  br label %done
+
+done:
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
new file mode 100644
index 00000000000..bf690ca4cb2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+; SI-LABEL: {{^}}trunc_i64_to_i32_store:
+; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
+; SI: buffer_store_dword [[VLOAD]]
+
+; EG-LABEL: {{^}}trunc_i64_to_i32_store:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG: LSHR
+; EG-NEXT: 2(
+
+  %result = trunc i64 %in to i32
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}trunc_load_shl_i64:
+; SI-DAG: s_load_dwordx2
+; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: buffer_store_dword [[VSHL]],
+define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+  %b = shl i64 %a, 2
+  %result = trunc i64 %b to i32
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}trunc_shl_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
+; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; SI: s_addc_u32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: buffer_store_dword v[[LO_VREG]],
+define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+  %aa = add i64 %a, 234 ; Prevent shrinking store.
+ %b = shl i64 %aa, 2 + %result = trunc i64 %b to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits + ret void +} + +; SI-LABEL: {{^}}trunc_i32_to_i1: +; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: v_cmp_eq_i32 +define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { + %a = load i32, i32 addrspace(1)* %ptr, align 4 + %trunc = trunc i32 %a to i1 + %result = select i1 %trunc, i32 1, i32 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: +; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} +; SI: v_cmp_eq_i32 +define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { + %trunc = trunc i32 %a to i1 + %result = select i1 %trunc, i32 1, i32 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}s_trunc_i64_to_i1: +; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { + %trunc = trunc i64 %x to i1 + %sel = select i1 %trunc, i32 63, i32 -12 + store i32 %sel, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}v_trunc_i64_to_i1: +; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} +; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %x = load i64, i64 addrspace(1)* %gep + + %trunc = trunc i64 %x to i1 + %sel = select i1 %trunc, i32 63, i32 -12 + store i32 %sel, i32 addrspace(1)* %out.gep + ret void +} diff --git a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll new file mode 100644 index 00000000000..76c32afc1f2 --- /dev/null +++ b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -0,0 +1,58 @@ +; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s + +; This IR comes from this OpenCL C code: +; +; if (b + 4 > a) { +; for (int i = 0; i < 4; i++, b++) { +; if (b + 1 <= a) +; *(dst + c + b) = 0; +; else +; break; +; } +; } +; +; This test is meant to check that this loop isn't unrolled into more than +; four iterations. The loop unrolling preferences we currently use cause this +; loop to not be unrolled at all, but that may change in the future. 
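+; The CHECK / CHECK-NOT pair below therefore accepts exactly one copy of the
+; store, matching the current "not unrolled at all" behavior; it would need
+; updating if partial unrolling (up to four iterations) were ever enabled.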
+ +; CHECK-LABEL: @test +; CHECK: store i8 0, i8 addrspace(1)* +; CHECK-NOT: store i8 0, i8 addrspace(1)* +; CHECK: ret void +define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { +entry: + %add = add nsw i32 %b, 4 + %cmp = icmp sgt i32 %add, %a + br i1 %cmp, label %for.cond.preheader, label %if.end7 + +for.cond.preheader: ; preds = %entry + %cmp313 = icmp slt i32 %b, %a + br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit + +if.then4.lr.ph: ; preds = %for.cond.preheader + %0 = sext i32 %c to i64 + br label %if.then4 + +if.then4: ; preds = %if.then4.lr.ph, %if.then4 + %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ] + %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ] + %add2 = add nsw i32 %b.addr.014, 1 + %1 = sext i32 %b.addr.014 to i64 + %add.ptr.sum = add nsw i64 %1, %0 + %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum + store i8 0, i8 addrspace(1)* %add.ptr5, align 1 + %inc = add nsw i32 %i.015, 1 + %cmp1 = icmp slt i32 %inc, 4 + %cmp3 = icmp slt i32 %add2, %a + %or.cond = and i1 %cmp3, %cmp1 + br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge + +for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4 + br label %if.end7.loopexit + +if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader + br label %if.end7 + +if.end7: ; preds = %if.end7.loopexit, %entry + ret void +} diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll new file mode 100644 index 00000000000..11438f267ad --- /dev/null +++ b/test/CodeGen/AMDGPU/uaddo.ll @@ -0,0 +1,85 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}uaddo_i64_zext: +; SI: add +; SI: addc +; SI: addc + +; EG: ADDC_UINT +; EG: ADDC_UINT +define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_uaddo_i32: +; SI: s_add_i32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i32: +; SI: v_add_i32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = 
extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_uaddo_i64: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i64: +; SI: v_add_i32 +; SI: v_addc_u32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll new file mode 100644 index 00000000000..de22a22e502 --- /dev/null +++ b/test/CodeGen/AMDGPU/udiv.ll @@ -0,0 +1,48 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +;EG-LABEL: {{^}}test: +;EG-NOT: SETGE_INT +;EG: CF_END + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1) * %in + %b = load i32, i32 addrspace(1) * %b_ptr + %result = udiv i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +;The code generated by udiv is long and complex and may frequently change. 
+;The goal of this test is to make sure the ISel doesn't fail when it gets +;a v4i32 udiv + +;EG-LABEL: {{^}}test2: +;EG: CF_END +;SI-LABEL: {{^}}test2: +;SI: s_endpgm + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = udiv <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}test4: +;EG: CF_END +;SI-LABEL: {{^}}test4: +;SI: s_endpgm + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = udiv <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll new file mode 100644 index 00000000000..b3837f28209 --- /dev/null +++ b/test/CodeGen/AMDGPU/udivrem.ll @@ -0,0 +1,345 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_udivrem: +; EG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG: MULLO_INT +; EG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] +; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] +; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] +; SI: v_cndmask_b32_e64 +; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] +; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] +; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] +; SI: v_cndmask_b32_e64 +; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] +; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] +; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %result0 = udiv i32 %x, %y + store i32 %result0, i32 addrspace(1)* %out + %result1 = urem i32 %x, %y + store i32 %result1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_udivrem_v2: +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: 
SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { + %result0 = udiv <2 x i32> %x, %y + store <2 x i32> %result0, <2 x i32> addrspace(1)* %out + %result1 = urem <2 x i32> %x, %y + store <2 x i32> %result1, <2 x i32> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_udivrem_v4: +; EG-DAG: RECIP_UINT +; EG-DAG: 
MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; 
SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] +; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] +; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { + %result0 = udiv <4 x i32> %x, %y + store <4 x i32> %result0, <4 x i32> addrspace(1)* %out + %result1 = urem <4 x i32> %x, %y + store <4 x i32> %result1, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll new file mode 100644 index 00000000000..4de881b66f1 --- /dev/null +++ b/test/CodeGen/AMDGPU/udivrem24.ll @@ -0,0 +1,245 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}udiv24_i8: +; SI: v_cvt_f32_ubyte +; SI: v_cvt_f32_ubyte +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void 
@udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = udiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i16: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = udiv i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i32: +; SI: v_cvt_f32_u32 +; SI-DAG: v_cvt_f32_u32 +; SI-DAG: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}udiv25_i32: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}urem24_i8: +; SI: 
v_cvt_f32_ubyte +; SI: v_cvt_f32_ubyte +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = urem i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}urem24_i16: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = urem i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}urem24_i32: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}urem25_i32: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_urem24_i32_1: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_urem24_i32_2: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr 
i32 %den.i24.0, 8 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll new file mode 100644 index 00000000000..9f3069bdf80 --- /dev/null +++ b/test/CodeGen/AMDGPU/udivrem64.ll @@ -0,0 +1,223 @@ +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_udiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = udiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: AND_INT {{.*}}, 1, + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = urem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = udiv i64 %1, %2 + 
store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: v_mad_f32 +;GCN: s_endpgm +define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = udiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: v_mad_f32 +;GCN: s_endpgm +define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll new file mode 100644 index 00000000000..dfec8eb15cb --- /dev/null +++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 +; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 +; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %gep, align 8 + %result = uitofp i64 %val to double + store double %result, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 +define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { + %cast = uitofp i64 %in to double + store double %cast, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 +define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { + %cast = uitofp <2 x i64> %in to <2 x double> + store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 +define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { + %cast = uitofp <4 x i64> %in to <4 x double> + store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { + %cast = uitofp i32 %in to double + store double %cast, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: 
s_endpgm +define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { + %cast = uitofp <2 x i32> %in to <2 x double> + store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { + %cast = uitofp <4 x i32> %in to <4 x double> + store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: select on 0, 0 +; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already +; uses an SGPR for [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to double + store double %fp, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load: +; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1 +; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { + %fp = uitofp i1 %in to double + store double %fp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll new file mode 100644 index 00000000000..00fea80b1bc --- /dev/null +++ b/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -0,0 +1,82 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32: +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z + +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { + %result = uitofp i32 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32: +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = uitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32: +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %value = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = uitofp <4 x i32> %value to <4 x float> + store <4 x float> %result, <4 x float> 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32: +; R600: UINT_TO_FLT +; R600: UINT_TO_FLT +; R600: MULADD_IEEE +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 +; SI: s_endpgm +define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) { +entry: + %0 = uitofp i64 %in to float + store float %0, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) { + %fp = uitofp i1 %in to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll new file mode 100644 index 00000000000..82d88ebd3ae --- /dev/null +++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -0,0 +1,254 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}unaligned_load_store_i16_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { + %v = load i16, i16 addrspace(3)* %p, align 1 + store i16 %v, i16 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i16_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { + %v = load i16, i16 addrspace(1)* %p, align 1 + store i16 %v, i16 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { + %v = load i32, i32 addrspace(3)* %p, align 1 + store i32 %v, i32 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i32_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { + %v = load i32, i32 addrspace(1)* %p, align 1 + store i32 %v, i32 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i64_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: 
ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { + %v = load i64, i64 addrspace(3)* %p, align 1 + store i64 %v, i64 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i64_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { + %v = load i64, i64 addrspace(1)* %p, align 1 + store i64 %v, i64 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_v4i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { + %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 + store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 + ret void +} + +; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded. 
+; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind { + %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 + store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_4: +; SI: ds_read2_b32 +; SI: s_endpgm +define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %val = load i64, i64 addrspace(3)* %in, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 +; SI: s_endpgm +define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 + %val = load i64, i64 addrspace(3)* %ptr, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset: +; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 +; SI: s_endpgm +define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* + %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 + %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* + %val = load i64, i64 addrspace(3)* %ptri64, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_1: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: buffer_store_dwordx2 +; SI: s_endpgm + +define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %val = load i64, i64 addrspace(3)* %in, align 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}store_lds_i64_align_4: +; SI: ds_write2_b32 +; SI: s_endpgm +define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { + store i64 %val, i64 addrspace(3)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset +; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 +; SI: s_endpgm +define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { + %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 + store i64 0, i64 addrspace(3)* %ptr, align 4 + ret void +} + +; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset: +; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits +; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: s_endpgm +define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { + %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* + 
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 + %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* + store i64 0, i64 addrspace(3)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll new file mode 100644 index 00000000000..036a7e91b47 --- /dev/null +++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll @@ -0,0 +1,115 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s + +; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. + +; COMMON-LABEL: {{^}}branch_true: +define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 true, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; COMMON-LABEL: {{^}}branch_false: +; SI: .text +; SI-NEXT: s_endpgm +define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 false, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 
%add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; COMMON-LABEL: {{^}}branch_undef: +; SI: .text +; SI-NEXT: s_endpgm +define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 undef, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/unroll.ll b/test/CodeGen/AMDGPU/unroll.ll new file mode 100644 index 00000000000..411a15a4b83 --- /dev/null +++ b/test/CodeGen/AMDGPU/unroll.ll @@ -0,0 +1,36 @@ +; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s +; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s + + +; This test contains a simple loop that initializes an array declared in +; private memory. We want to make sure these kinds of loops are always +; unrolled, because private memory is slow. 
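+; Once the loop is fully unrolled all GEP indices become constants, so -sroa can
+; promote the private array and the load of element 5 folds to the constant 5
+; that the CHECK line below expects.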
+ +; CHECK-LABEL: @test +; CHECK-NOT: alloca +; CHECK: store i32 5, i32 addrspace(1)* %out +define void @test(i32 addrspace(1)* %out) { +entry: + %0 = alloca [32 x i32] + br label %loop.header + +loop.header: + %counter = phi i32 [0, %entry], [%inc, %loop.inc] + br label %loop.body + +loop.body: + %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter + store i32 %counter, i32* %ptr + br label %loop.inc + +loop.inc: + %inc = add i32 %counter, 1 + %1 = icmp sge i32 %counter, 32 + br i1 %1, label %exit, label %loop.header + +exit: + %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5 + %3 = load i32, i32* %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll new file mode 100644 index 00000000000..8ab4faf2f14 --- /dev/null +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests are for condition codes that are not supported by the hardware + +; CHECK-LABEL: {{^}}slt: +; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5(7.006492e-45) +define void @slt(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp slt i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_i32: +; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5(7.006492e-45) +define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ult i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_float: +; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x +; CHECK-NEXT: 1084227584(5.000000e+00) +; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 +; CHECK-NEXT: LSHR * +define void @ult_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_float_native: +; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @ult_float_native(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 0.0, float 1.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}olt: +; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @olt(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}sle: +; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 6(8.407791e-45) +define void @sle(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sle i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_i32: +; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 6(8.407791e-45) +define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ule i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_float: +; CHECK: SETGT * 
T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x +; CHECK-NEXT: 1084227584(5.000000e+00) +; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 +; CHECK-NEXT: LSHR * +define void @ule_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_float_native: +; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @ule_float_native(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 0.0, float 1.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ole: +; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR * +; CHECK-NEXT:1084227584(5.000000e+00) +define void @ole(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll new file mode 100644 index 00000000000..daacc771708 --- /dev/null +++ b/test/CodeGen/AMDGPU/urecip.ll @@ -0,0 +1,13 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: v_rcp_iflag_f32_e32 + +define void @test(i32 %p, i32 %q) { + %i = udiv i32 %p, %q + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll new file mode 100644 index 00000000000..62841ec2d6c --- /dev/null +++ b/test/CodeGen/AMDGPU/urem.ll @@ -0,0 +1,94 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; The code generated by urem is long and complex and may frequently +; change. 
The goal of this test is to make sure the ISel doesn't fail +; when it gets a v2i32/v4i32 urem + +; FUNC-LABEL: {{^}}test_urem_i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = urem i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_i32_7: +; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 +; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] +; SI: v_subrev_i32 +; SI: v_mul_lo_i32 +; SI: v_sub_i32 +; SI: buffer_store_dword +; SI: s_endpgm +define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = urem i32 %num, 7 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v2i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = urem <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v4i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = urem <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1)* %in + %b = load i64, i64 addrspace(1)* %b_ptr + %result = urem i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v2i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1)* %in + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = urem <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v4i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1)* %in + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = urem <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll new file mode 100644 index 00000000000..f26f30022b4 --- /dev/null +++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float @llvm.fma.f32(float, float, 
float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 + + +; GCN-LABEL: {{^}}test_sgpr_use_twice_binop: +; GCN: s_load_dword [[SGPR:s[0-9]+]], +; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { + %dbl = fadd float %a, %a + store float %dbl, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op: +; GCN: s_load_dword [[SGPR:s[0-9]+]], +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}test_sgpr_use_twice_ternary_op_a_imm_a: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; Don't use fma since fma c, x, y is canonicalized to fma x, c, y +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 { + %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1 + store i32 %fma, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll new file mode 100644 index 00000000000..3c9b1622a07 --- /dev/null +++ b/test/CodeGen/AMDGPU/usubo.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}usubo_i64_zext: + +; EG: SUBB_UINT +; EG: ADDC_UINT +define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_usubo_i32: +; SI: s_sub_i32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_usubo_i32: +; SI: v_subrev_i32_e32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_usubo_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG: SUB_INT +define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = 
extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_usubo_i64: +; SI: v_sub_i32 +; SI: v_subb_u32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG: SUB_INT +define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll new file mode 100644 index 00000000000..31755125c03 --- /dev/null +++ b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll @@ -0,0 +1,17 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s + +; CHECK-LABEL: {{^}}kernel_arg_i64: +define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { + store i64 %a, i64 addrspace(1)* %out, align 8 + ret void +} + +; i64 arg works, v1i64 arg does not. +; CHECK-LABEL: {{^}}kernel_arg_v1i64: +define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { + store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 + ret void +} + diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll new file mode 100644 index 00000000000..c368c5aaf7d --- /dev/null +++ b/test/CodeGen/AMDGPU/v_cndmask.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; SI-LABEL: {{^}}v_cnd_nan_nosgpr: +; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: v{{[0-9]}} +; All nan values are converted to 0xffffffff +; SI: s_endpgm +define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { + %idx = call i32 @llvm.r600.read.tidig.x() #1 + %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx + %f = load float, float addrspace(1)* %fptr + %setcc = icmp ne i32 %c, 0 + %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f + store float %select, float addrspace(1)* %out + ret void +} + + +; This requires slightly trickier SGPR operand legalization since the +; single constant bus SGPR usage is the last operand, and it should +; never be moved. 
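+; Unlike v_cnd_nan_nosgpr above, %f here is a scalar kernel argument rather than
+; a loaded value, so it starts out in an SGPR instead of a VGPR.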
+ +; SI-LABEL: {{^}}v_cnd_nan: +; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: v{{[0-9]}} +; All nan values are converted to 0xffffffff +; SI: s_endpgm +define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { + %setcc = icmp ne i32 %c, 0 + %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f + store float %select, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll new file mode 100644 index 00000000000..7d0ebd139f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -0,0 +1,188 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: @test_if +; Make sure the i1 values created by the cfg structurizer pass are +; moved using VALU instructions +; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 +; SI: v_mov_b32_e32 v{{[0-9]}}, -1 +define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +entry: + switch i32 %a, label %default [ + i32 0, label %case0 + i32 1, label %case1 + ] + +case0: + %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + store i32 0, i32 addrspace(1)* %arrayidx1, align 4 + br label %end + +case1: + %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + store i32 1, i32 addrspace(1)* %arrayidx5, align 4 + br label %end + +default: + %cmp8 = icmp eq i32 %a, 2 + %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + br i1 %cmp8, label %if, label %else + +if: + store i32 2, i32 addrspace(1)* %arrayidx10, align 4 + br label %end + +else: + store i32 3, i32 addrspace(1)* %arrayidx10, align 4 + br label %end + +end: + ret void +} + +; SI-LABEL: @simple_test_v_if +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] + +; SI: ; BB#1 +; SI: buffer_store_dword +; SI: s_endpgm + +; SI: BB1_2: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %store, label %exit + +store: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; SI-LABEL: @simple_test_v_loop +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: s_cbranch_execz BB2_2 + +; SI: ; BB#1: +; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} + +; SI: BB2_3: +; SI: buffer_load_dword +; SI: buffer_store_dword +; SI: v_cmp_eq_i32_e32 vcc, +; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] +; SI: s_andn2_b64 exec, exec, [[OR_SREG]] +; SI: s_cbranch_execnz BB2_3 + +define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %is.0 = icmp ne i32 %tid, 0 + %limit = add i32 %tid, 64 + br i1 %is.0, label %loop, label %exit + +loop: + %i = phi i32 [%tid, %entry], [%i.inc, %loop] + %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i + %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i + %load = load i32, i32 addrspace(1)* %src + store i32 %load, i32 
addrspace(1)* %gep.dst + %i.inc = add nsw i32 %i, 1 + %cmp = icmp eq i32 %limit, %i.inc + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +; SI-LABEL: @multi_vcond_loop + +; Load loop limit from buffer +; Branch to exit if uniformly not taken +; SI: ; BB#0: +; SI: buffer_load_dword [[VBOUND:v[0-9]+]] +; SI: v_cmp_lt_i32_e32 vcc +; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] +; SI: s_cbranch_execz BB3_2 + +; Initialize inner condition to false +; SI: ; BB#1: +; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] + +; Clear exec bits for workitems that load -1s +; SI: BB3_3: +; SI: buffer_load_dword [[B:v[0-9]+]] +; SI: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] +; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] +; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] +; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_cbranch_execz BB3_5 + +; SI: BB#4: +; SI: buffer_store_dword +; SI: v_cmp_ge_i64_e32 vcc +; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] + +; SI: BB3_5: +; SI: s_or_b64 exec, exec, [[ORNEG1]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_andn2_b64 exec, exec, [[COND_STATE]] +; SI: s_cbranch_execnz BB3_3 + +; SI: BB#6 +; SI: s_or_b64 exec, exec, [[COND_STATE]] + +; SI: BB3_2: +; SI-NOT: [[COND_STATE]] +; SI: s_endpgm + +define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +bb: + %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 + %tmp4 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 + %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 + %tmp7 = icmp sgt i32 %tmp6, 0 + %tmp8 = sext i32 %tmp6 to i64 + br i1 %tmp7, label %bb10, label %bb26 + +bb10: ; preds = %bb, %bb20 + %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] + %tmp12 = add nsw i64 %tmp11, %tmp4 + %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 + %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 + %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 + %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 + %tmp17 = icmp ne i32 %tmp14, -1 + %tmp18 = icmp ne i32 %tmp16, -1 + %tmp19 = and i1 %tmp17, %tmp18 + br i1 %tmp19, label %bb20, label %bb26 + +bb20: ; preds = %bb10 + %tmp21 = add nsw i32 %tmp16, %tmp14 + %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 + store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 + %tmp23 = add nuw nsw i64 %tmp11, 1 + %tmp24 = icmp slt i64 %tmp23, %tmp8 + br i1 %tmp24, label %bb10, label %bb26 + +bb26: ; preds = %bb10, %bb20, %bb + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll new file mode 100644 index 00000000000..6f3b4847fbd --- /dev/null +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -0,0 +1,77 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC 
%s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}vector_read:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 1, i32* %y
+ store i32 2, i32* %z
+ store i32 3, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
+ %2 = load i32, i32* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_write:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+; EG: MOVA_INT
+define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
+ store i32 1, i32* %1
+ %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
+ %3 = load i32, i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; This test should be optimized to:
+; store i32 0, i32 addrspace(1)* %out
+; FUNC-LABEL: {{^}}bitcast_gep:
+; EG: STORE_RAW
+define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %2 = bitcast i32* %1 to [4 x i32]*
+ %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
+ %4 = load i32, i32* %3
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
new file mode 100644
index 00000000000..fb6a17e6714
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s
+
+; NI: {{^}}vtx_fetch32:
+; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
+; CM: {{^}}vtx_fetch32:
+; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
+
+define void @vtx_fetch32(i32 addrspace(1)* 
%out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; NI: {{^}}vtx_fetch128: +; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 +; XXX: Add a case for Cayman when v4i32 stores are supported. + +define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +entry: + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in + store <4 x i32> %0, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll new file mode 100644 index 00000000000..9b2f229c05a --- /dev/null +++ b/test/CodeGen/AMDGPU/vop-shrink.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; Test that we correctly commute a sub instruction +; FUNC-LABEL: {{^}}sub_rev: +; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s +; SI: v_subrev_i32_e32 v{{[0-9]+}}, s + +; ModuleID = 'vop-shrink.ll' + +define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { +entry: + %vgpr = call i32 @llvm.r600.read.tidig.x() #1 + %tmp = icmp eq i32 %cond, 0 + br i1 %tmp, label %if, label %else + +if: ; preds = %entry + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %tmp2 = extractelement <4 x i32> %sgpr, i32 1 + store i32 %tmp2, i32 addrspace(1)* %out + br label %endif + +else: ; preds = %entry + %tmp3 = extractelement <4 x i32> %sgpr, i32 2 + %tmp4 = sub i32 %vgpr, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %out + br label %endif + +endif: ; preds = %else, %if + ret void +} + +; Test that we fold an immediate that was illegal for a 64-bit op into the +; 32-bit op when we shrink it. 
+ +; FUNC-LABEL: {{^}}add_fold: +; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 +define void @add_fold(float addrspace(1)* %out) { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() + %tmp1 = uitofp i32 %tmp to float + %tmp2 = fadd float %tmp1, 1.024000e+03 + store float %tmp2, float addrspace(1)* %out + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll new file mode 100644 index 00000000000..a3014b03d2b --- /dev/null +++ b/test/CodeGen/AMDGPU/vselect.ll @@ -0,0 +1,77 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +;EG: {{^}}test_select_v2i32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v2i32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { +entry: + %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 + %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 + %cmp = icmp ne <2 x i32> %0, %1 + %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1 + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v2f32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v2f32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 + %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 + %cmp = fcmp une <2 x float> %0, %1 + %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v4i32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v4i32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +entry: + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 + %cmp = icmp ne <4 x i32> %0, %1 + %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v4f32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], 
T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+entry:
+ %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
+ %cmp = fcmp une <4 x float> %0, %1
+ %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vselect64.ll b/test/CodeGen/AMDGPU/vselect64.ll
new file mode 100644
index 00000000000..ef85ebe7899
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vselect64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
+
+; CHECK-LABEL: {{^}}test_select_v4i64:
+; Make sure the vectors aren't being stored on the stack. We know they are
+; being stored on the stack if the shader uses at least 10 registers.
+; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
+define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+entry:
+ %cmp = icmp ne <4 x i32> %c, <i32 0, i32 1, i32 2, i32 3>
+ %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
new file mode 100644
index 00000000000..4584d6e2525
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s
+
+; This tests for a bug where vertex fetch clauses right before an ENDIF
+; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER
+; for the ALU clause before the vertex fetch instead of emitting a POP instruction
+; after the fetch clause.
+
+
+; CHECK-LABEL: {{^}}test:
+; CHECK-NOT: ALU_POP_AFTER
+; CHECK: TEX
+; CHECK-NEXT: POP
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %endif, label %if
+
+if:
+ %1 = load i32, i32 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %x = phi i32 [ %1, %if], [ 0, %entry]
+ store i32 %x, i32 addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vtx-schedule.ll b/test/CodeGen/AMDGPU/vtx-schedule.ll
new file mode 100644
index 00000000000..912e258ebb8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test is for a scheduler bug where VTX_READ instructions that used
+; the result of another VTX_READ instruction were being grouped in the
+; same fetch clause.
+ +; CHECK: {{^}}test: +; CHECK: Fetch clause +; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 +; CHECK: Fetch clause +; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 +define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { +entry: + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll new file mode 100644 index 00000000000..5cc7577cad3 --- /dev/null +++ b/test/CodeGen/AMDGPU/wait.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s + +; CHECK-LABEL: {{^}}main: +; CHECK: s_load_dwordx4 +; CHECK: s_load_dwordx4 +; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; CHECK: s_endpgm +define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { +main_body: + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) + %tmp12 = extractelement <4 x float> %tmp11, i32 0 + %tmp13 = extractelement <4 x float> %tmp11, i32 1 + call void @llvm.AMDGPU.barrier.global() #1 + %tmp14 = extractelement <4 x float> %tmp11, i32 2 +; %tmp15 = extractelement <4 x float> %tmp11, i32 3 + %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt + %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 + %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 + %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) + %tmp19 = extractelement <4 x float> %tmp18, i32 0 + %tmp20 = extractelement <4 x float> %tmp18, i32 1 + %tmp21 = extractelement <4 x float> %tmp18, i32 2 + %tmp22 = extractelement <4 x float> %tmp18, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.global() #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind readnone } + +!0 = !{!1, !1, i64 0, i32 1} +!1 = !{!"const", null} diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll new file mode 100644 index 00000000000..4328e964c1b --- /dev/null +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -0,0 +1,238 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 
-mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}ngroups_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].X + +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ngroups_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].Y + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ngroups_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].W + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].X + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Y + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; 
GCN: buffer_store_dword [[VVAL]] +define void @local_size_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}get_work_dim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @get_work_dim (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; The tgid values are stored in sgprs offset by the number of user sgprs. +; Currently we always use exactly 2 user sgprs for the pointer to the +; kernel arguments, but this may change in the future. + +; FUNC-LABEL: {{^}}tgid_x: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tgid_y: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tgid_z: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_x: +; GCN: buffer_store_dword v0 +define void @tidig_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_y: +; GCN: buffer_store_dword v1 +define void @tidig_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_z: +; GCN: buffer_store_dword v2 +define void @tidig_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.ngroups.x() #0 +declare i32 @llvm.r600.read.ngroups.y() #0 +declare i32 @llvm.r600.read.ngroups.z() #0 + +declare i32 @llvm.r600.read.global.size.x() #0 +declare i32 @llvm.r600.read.global.size.y() #0 +declare i32 @llvm.r600.read.global.size.z() #0 + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +declare i32 @llvm.r600.read.tgid.x() #0 +declare i32 @llvm.r600.read.tgid.y() #0 +declare i32 @llvm.r600.read.tgid.z() #0 + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 +declare i32 @llvm.r600.read.tidig.z() #0 + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git 
a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll new file mode 100644 index 00000000000..8b383e4c393 --- /dev/null +++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s + +; We want all MULLO_INT inst to be last in their instruction group +;CHECK: {{^}}fill3d: +;CHECK-NOT: MULLO_INT T[0-9]+ + +define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { +entry: + %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 + %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 + %mul = mul i32 %y.i18, %x.i + %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1 + %mul3 = mul i32 %mul, %z.i17 + %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1 + %mul26.i = mul i32 %x.i12.i, %x.i.i + %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.i16 = add i32 %x.i4.i, %mul26.i + %mul7 = mul i32 %add.i16, %y.i18 + %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1 + %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1 + %mul30.i = mul i32 %y.i14.i, %y.i.i + %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %add.i14 = add i32 %mul30.i, %mul7 + %mul819 = add i32 %add.i14, %y.i6.i + %add = mul i32 %mul819, %z.i17 + %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1 + %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1 + %mul33.i = mul i32 %z.i16.i, %z.i.i + %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1 + %add.i = add i32 %z.i8.i, %mul33.i + %add13 = add i32 %add.i, %add + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13 + store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.z() #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!opencl.kernels = !{!0, !1, !2} + +!0 = !{null} +!1 = !{null} +!2 = !{void (i32 addrspace(1)*)* @fill3d} diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll new file mode 100644 index 00000000000..089db59eabc --- /dev/null +++ b/test/CodeGen/AMDGPU/xor.ll @@ -0,0 +1,173 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}xor_v2i32: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 + %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 + %result = xor <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}xor_v4i32: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 + %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 + %result = xor <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}xor_i1: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} +; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} +; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float, float addrspace(1) * %in0 + %b = load float, float addrspace(1) * %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 1.000000e+00 + %xor = xor i1 %acmp, %bcmp + %result = select i1 %xor, float %a, float %b + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_xor_i1: +; SI: buffer_load_ubyte [[B:v[0-9]+]] +; SI: buffer_load_ubyte [[A:v[0-9]+]] +; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] +; SI: buffer_store_byte [[RESULT]] +define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { + %a = load i1, i1 addrspace(1)* %in0 + %b = load i1, i1 addrspace(1)* %in1 + %xor = xor i1 %a, %b + store i1 %xor, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i32: +; SI: v_xor_b32_e32 +define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { + %a = load i32, i32 addrspace(1)* %in0 + %b = load i32, i32 addrspace(1)* %in1 + %result = xor i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_i32: +; SI: s_xor_b32 +define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %result = xor i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_not_i32: +; SI: s_not_b32 
+define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { + %result = xor i32 %a, -1 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_not_i32: +; SI: v_not_b32 +define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { + %a = load i32, i32 addrspace(1)* %in0 + %b = load i32, i32 addrspace(1)* %in1 + %result = xor i32 %a, -1 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i64: +; SI: v_xor_b32_e32 +; SI: v_xor_b32_e32 +; SI: s_endpgm +define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { + %a = load i64, i64 addrspace(1)* %in0 + %b = load i64, i64 addrspace(1)* %in1 + %result = xor i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_i64: +; SI: s_xor_b64 +; SI: s_endpgm +define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %result = xor i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_not_i64: +; SI: s_not_b64 +define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { + %result = xor i64 %a, -1 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_not_i64: +; SI: v_not_b32 +; SI: v_not_b32 +define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { + %a = load i64, i64 addrspace(1)* %in0 + %b = load i64, i64 addrspace(1)* %in1 + %result = xor i64 %a, -1 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Test that we have a pattern to match xor inside a branch. +; Note that in the future the backend may be smart enough to +; use an SALU instruction for this. + +; FUNC-LABEL: {{^}}xor_cf: +; SI: s_xor_b64 +define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = xor i64 %a, %b + br label %endif + +else: + %2 = load i64, i64 addrspace(1)* %in + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll new file mode 100644 index 00000000000..033055db185 --- /dev/null +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI + +; R600: {{^}}test: +; R600: MEM_RAT_CACHELESS STORE_RAW +; R600: MEM_RAT_CACHELESS STORE_RAW + +; SI: {{^}}test: +; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} +; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] +; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} +define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = mul i32 %a, %b + %1 = add i32 %0, %c + %2 = zext i32 %1 to i64 + store i64 %2, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}testi1toi32: +; SI: v_cndmask_b32 +define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a, %b + %1 = zext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}zext_i1_to_i64: +; SI: s_mov_b32 s{{[0-9]+}}, 0 +; SI: v_cmp_eq_i32 +; SI: v_cndmask_b32 +define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp 
eq i32 %a, %b + %ext = zext i1 %cmp to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll deleted file mode 100644 index c7bcfd2ddab..00000000000 --- a/test/CodeGen/R600/32-bit-local-address-space.ll +++ /dev/null @@ -1,139 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and -; the global address space(1) uses 64-bit pointers. These tests check to make sure -; the correct pointer size is used for the local address space. - -; The e{{32|64}} suffix on the instructions refers to the encoding size and not -; the size of the operands. The operand size is denoted in the instruction name. -; Instructions with B32, U32, and I32 in their name take 32-bit operands, while -; instructions with B64, U64, and I64 take 64-bit operands. - -; FUNC-LABEL: {{^}}local_address_load: -; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] -; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] -define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = load i32, i32 addrspace(3)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep: -; SI: s_add_i32 [[SPTR:s[0-9]]] -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_const_offset: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 -define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Offset too large, can't fold into 16-bit immediate offset. 
-; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}null_32bit_lds_ptr: -; SI: v_cmp_ne_i32 -; SI-NOT: v_cmp_ne_i32 -; SI: v_cndmask_b32 -define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { - %cmp = icmp ne i32 addrspace(3)* %lds, null - %x = select i1 %cmp, i32 123, i32 456 - store i32 %x, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}mul_32bit_ptr: -; SI: s_mul_i32 -; SI-NEXT: s_add_i32 -; SI: ds_read_b32 -define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { - %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 - %val = load float, float addrspace(3)* %ptr - store float %val, float addrspace(1)* %out - ret void -} - -@g_lds = addrspace(3) global float undef, align 4 - -; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] -define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { - %val = load float, float addrspace(3)* @g_lds - store float %val, float addrspace(1)* %out - ret void -} - - -@ptr = addrspace(3) global i32 addrspace(3)* undef -@dst = addrspace(3) global [16384 x i32] undef - -; FUNC-LABEL: {{^}}global_ptr: -; SI: ds_write_b32 -define void @global_ptr() nounwind { - store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr - ret void -} - -; FUNC-LABEL: {{^}}local_address_store: -; SI: ds_write_b32 -define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { - store i32 %val, i32 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_store: -; SI: s_add_i32 [[SADDR:s[0-9]+]], -; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] -; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} -define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} -; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 -define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1 - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} - -; Offset too large, can't fold into 16-bit immediate offset. 
-; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} -define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} diff --git a/test/CodeGen/R600/README b/test/CodeGen/R600/README deleted file mode 100644 index 96998bba28f..00000000000 --- a/test/CodeGen/R600/README +++ /dev/null @@ -1,21 +0,0 @@ -+==============================================================================+ -| How to organize the lit tests | -+==============================================================================+ - -- If you write a test for matching a single DAG opcode or intrinsic, it should - go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll) - -- If you write a test that matches several DAG opcodes and checks for a single - ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g. - bfi_int.ll - -- For all other tests, use your best judgement for organizing tests and naming - the files. - -+==============================================================================+ -| Naming conventions | -+==============================================================================+ - -- Use dash '-' and not underscore '_' to separate words in file names, unless - the file is named after a DAG opcode or ISA instruction that has an - underscore '_' in its name. diff --git a/test/CodeGen/R600/add-debug.ll b/test/CodeGen/R600/add-debug.ll deleted file mode 100644 index 529905dd36a..00000000000 --- a/test/CodeGen/R600/add-debug.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug -; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug -; REQUIRES: asserts - -; Check that SelectionDAGDumper does not crash on int_SI_if. 
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = add i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll deleted file mode 100644 index 655e75dbc1a..00000000000 --- a/test/CodeGen/R600/add.ll +++ /dev/null @@ -1,192 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test1: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} -;SI-NOT: [[REG]] -;SI: buffer_store_dword [[REG]], -define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test2: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = add <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test4: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = add <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test8: -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT - -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { -entry: - %0 = add <8 x i32> %a, %b - store <8 x i32> %0, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test16: -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: 
ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT - -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { -entry: - %0 = add <16 x i32> %a, %b - store <16 x i32> %0, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}add64: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = add i64 %a, %b - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they -; use VCC. The test is designed so that %a will be stored in an SGPR and -; %0 will be stored in a VGPR, so the comiler will be forced to copy %a -; to a VGPR before doing the add. - -; FUNC-LABEL: {{^}}add64_sgpr_vgpr: -; SI-NOT: v_addc_u32_e32 s - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { -entry: - %0 = load i64, i64 addrspace(1)* %in - %1 = add i64 %a, %0 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; Test i64 add inside a branch. -; FUNC-LABEL: {{^}}add64_in_branch: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = add i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll deleted file mode 100644 index 8346add7df9..00000000000 --- a/test/CodeGen/R600/add_i64.ll +++ /dev/null @@ -1,84 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - - -declare i32 @llvm.r600.read.tidig.x() readnone - -; SI-LABEL: {{^}}test_i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid - %a = load i64, i64 addrspace(1)* %a_ptr - %b = load i64, i64 addrspace(1)* %b_ptr - %result = add i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Check that the SGPR add operand is correctly moved to a VGPR. 
-; SI-LABEL: {{^}}sgpr_operand: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { - %foo = load i64, i64 addrspace(1)* %in, align 8 - %result = add i64 %foo, %a - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Swap the arguments. Check that the SGPR -> VGPR copy works with the -; SGPR as other operand. -; -; SI-LABEL: {{^}}sgpr_operand_reversed: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { - %foo = load i64, i64 addrspace(1)* %in, align 8 - %result = add i64 %a, %foo - store i64 %result, i64 addrspace(1)* %out - ret void -} - - -; SI-LABEL: {{^}}test_v2i64_sreg: -; SI: s_add_u32 -; SI: s_addc_u32 -; SI: s_add_u32 -; SI: s_addc_u32 -define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { - %result = add <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}test_v2i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 -; SI: v_add_i32 -; SI: v_addc_u32 -define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid - %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = add <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}trunc_i64_add_to_i32: -; SI: s_load_dword s[[SREG0:[0-9]+]] -; SI: s_load_dword s[[SREG1:[0-9]+]] -; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI-NOT: addc -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %add = add i64 %b, %a - %trunc = trunc i64 %add to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll deleted file mode 100644 index 4be8c584752..00000000000 --- a/test/CodeGen/R600/address-space.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; Test that codegenprepare understands address space sizes - -%struct.foo = type { [3 x float], [3 x float] } - -; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is -; already in a VGPR after the first read. 
- -; CHECK-LABEL: {{^}}do_as_ptr_calcs: -; CHECK: s_load_dword [[SREG1:s[0-9]+]], -; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] -; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 -define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { -entry: - %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 - %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 - br label %bb32 - -bb32: - %a = load float, float addrspace(3)* %x, align 4 - %b = load float, float addrspace(3)* %y, align 4 - %cmp = fcmp one float %a, %b - br i1 %cmp, label %bb34, label %bb33 - -bb33: - unreachable - -bb34: - unreachable -} - - diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll deleted file mode 100644 index 5672d470bd7..00000000000 --- a/test/CodeGen/R600/and.ll +++ /dev/null @@ -1,296 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test2: -; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = and <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test4: -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = and <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_and_i32: -; SI: s_and_b32 -define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %and = and i32 %a, %b - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_and_constant_i32: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 -define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { - %and = and i32 %a, 1234567 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_i32: -; SI: v_and_b32 -define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { - %a = load i32, i32 addrspace(1)* 
%aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %and = and i32 %a, %b - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_constant_i32 -; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} -define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %and = and i32 %a, 1234567 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 -; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} -define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %and = and i32 %a, 64 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 -; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} -define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %and = and i32 %a, -16 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_and_i64 -; SI: s_and_b64 -define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %and = and i64 %a, %b - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: Should use SGPRs -; FUNC-LABEL: {{^}}s_and_i1: -; SI: v_and_b32 -define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { - %and = and i1 %a, %b - store i1 %and, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_and_constant_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { - %and = and i64 %a, 281474976710655 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_and_i64: -; SI: v_and_b32 -; SI: v_and_b32 -define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %and = and i64 %a, %b - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_and_i64_br: -; SI: v_and_b32 -; SI: v_and_b32 -define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %endif - -if: - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %and = and i64 %a, %b - br label %endif - -endif: - %tmp1 = phi i64 [%and, %if], [0, %entry] - store i64 %tmp1, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_and_constant_i64: -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %and = and i64 %a, 1234567 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: Replace and 0 with mov 0 -; FUNC-LABEL: {{^}}v_and_inline_imm_i64: -; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} -define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %and = and i64 %a, 64 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: 
{{^}}s_and_inline_imm_64_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64 -define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 64 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1 -define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 1 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 -define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4607182418800017408 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 -define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13830554455654793216 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 -define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4602678819172646912 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 -define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13826050856027422720 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0 -define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4611686018427387904 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0 -define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13835058055282163712 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 -define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4616189618054758400 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 -define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13839561654909534208 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - - -; Test with the 64-bit integer bitpattern for a 32-bit float in the -; low 32-bits, which is not a valid 64-bit inline immmediate. 
- -; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 1082130432 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: Copy of -1 register -; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} -; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}} -define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, -1065353216 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; Shift into upper 32-bits -; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4647714815446351872 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13871086852301127680 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/anyext.ll b/test/CodeGen/R600/anyext.ll deleted file mode 100644 index 48d8f312249..00000000000 --- a/test/CodeGen/R600/anyext.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}anyext_i1_i32: -; CHECK: v_cndmask_b32_e64 -define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - %1 = zext i1 %0 to i8 - %2 = xor i8 %1, -1 - %3 = and i8 %2, 1 - %4 = zext i8 %3 to i32 - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll deleted file mode 100644 index 8c2a0795860..00000000000 --- a/test/CodeGen/R600/array-ptr-calc-i32.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s - -declare i32 @llvm.SI.tid() nounwind readnone -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate - -; The required pointer calculations for the alloca'd actually requires -; an add and won't be folded into the addressing, which fails with a -; 64-bit pointer add. This should work since private pointers should -; be 32-bits. 
- -; SI-LABEL: {{^}}test_private_array_ptr_calc: - -; FIXME: We end up with zero argument for ADD, because -; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index -; with the appropriate offset. We should fold this into the store. -; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} -; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] -; -; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this -; alloca to a vector. It currently fails because it does not know how -; to interpret: -; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b - -; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 -; SI-PROMOTE: ds_write_b32 [[PTRREG]] -define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { - %alloca = alloca [4 x i32], i32 4, align 16 - %tid = call i32 @llvm.SI.tid() readnone - %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid - %a = load i32, i32 addrspace(1)* %a_ptr - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b - store i32 %result, i32* %alloca_ptr, align 4 - ; Dummy call - call void @llvm.AMDGPU.barrier.local() nounwind noduplicate - %reload = load i32, i32* %alloca_ptr, align 4 - %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 - ret void -} - diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll deleted file mode 100644 index eae095eb844..00000000000 --- a/test/CodeGen/R600/array-ptr-calc-i64.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.SI.tid() readnone - -; SI-LABEL: {{^}}test_array_ptr_calc: -; SI: v_mul_lo_i32 -; SI: v_mul_hi_i32 -define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.SI.tid() readnone - %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 - %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid - %a = load i32, i32 addrspace(1)* %a_ptr - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll deleted file mode 100644 index ef2560ef184..00000000000 --- a/test/CodeGen/R600/atomic_cmp_swap_local.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SWAP:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] -; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic - %result = extractvalue { i64, i1 } %pair, 0 - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset -; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 -; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI: s_load_dwordx2 
s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] -; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic - %result = extractvalue { i64, i1 } %pair, 0 - ret void -} diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll deleted file mode 100644 index 20c685447ee..00000000000 --- a/test/CodeGen/R600/atomic_load_add.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_add_local: -; R600: LDS_ADD * -; SI: ds_add_u32 -define void @atomic_add_local(i32 addrspace(3)* %local) { - %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_local_const_offset: -; R600: LDS_ADD * -; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 - %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_ret_local: -; R600: LDS_ADD_RET * -; SI: ds_add_rtn_u32 -define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: -; R600: LDS_ADD_RET * -; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 - %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll deleted file mode 100644 index 4c6f45525b9..00000000000 --- a/test/CodeGen/R600/atomic_load_sub.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_sub_local: -; R600: LDS_SUB * -; SI: ds_sub_u32 -define void @atomic_sub_local(i32 addrspace(3)* %local) { - %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst - ret void -} - -; 
FUNC-LABEL: {{^}}atomic_sub_local_const_offset: -; R600: LDS_SUB * -; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 - %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_ret_local: -; R600: LDS_SUB_RET * -; SI: ds_sub_rtn_u32 -define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: -; R600: LDS_SUB_RET * -; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 - %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/basic-branch.ll b/test/CodeGen/R600/basic-branch.ll deleted file mode 100644 index abdc4afef47..00000000000 --- a/test/CodeGen/R600/basic-branch.ll +++ /dev/null @@ -1,16 +0,0 @@ -; XFAIL: * -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}test_branch( -define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { - %cmp = icmp ne i32 %val, 0 - br i1 %cmp, label %store, label %end - -store: - store i32 222, i32 addrspace(1)* %out - ret void - -end: - ret void -} diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll deleted file mode 100644 index f0263caf5d6..00000000000 --- a/test/CodeGen/R600/basic-loop.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; CHECK-LABEL: {{^}}test_loop: -define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { -entry: - br label %loop.body - -loop.body: - %i = phi i32 [0, %entry], [%i.inc, %loop.body] - store i32 222, i32 addrspace(1)* %out - %cmp = icmp ne i32 %i, %val - %i.inc = add i32 %i, 1 - br i1 %cmp, label %loop.body, label %end - -end: - ret void -} diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll deleted file mode 100644 index 32e3fc26106..00000000000 --- a/test/CodeGen/R600/bfe_uint.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}bfe_def: -; CHECK: BFE_UINT -define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { -entry: - %0 = lshr i32 %x, 5 - %1 = and i32 %0, 15 ; 0xf - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; This program could be implemented using a BFE_UINT instruction, however -; since the lshr constant + number of bits in the mask is >= 32, it can also be -; implmented with a LSHR instruction, which is better, because LSHR has less -; operands and requires less constants. 
- -; CHECK: {{^}}bfe_shift: -; CHECK-NOT: BFE_UINT -define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { -entry: - %0 = lshr i32 %x, 16 - %1 = and i32 %0, 65535 ; 0xffff - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll deleted file mode 100644 index 03349349735..00000000000 --- a/test/CodeGen/R600/bfi_int.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; BFI_INT Definition pattern from ISA docs -; (y & x) | (z & ~x) -; -; R600: {{^}}bfi_def: -; R600: BFI_INT -; SI: @bfi_def -; SI: v_bfi_b32 -define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { -entry: - %0 = xor i32 %x, -1 - %1 = and i32 %z, %0 - %2 = and i32 %y, %x - %3 = or i32 %1, %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; SHA-256 Ch function -; z ^ (x & (y ^ z)) -; R600: {{^}}bfi_sha256_ch: -; R600: BFI_INT -; SI: @bfi_sha256_ch -; SI: v_bfi_b32 -define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { -entry: - %0 = xor i32 %y, %z - %1 = and i32 %x, %0 - %2 = xor i32 %z, %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; SHA-256 Ma function -; ((x & z) | (y & (x | z))) -; R600: {{^}}bfi_sha256_ma: -; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W -; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W -; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} -; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} - -define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { -entry: - %0 = and i32 %x, %z - %1 = or i32 %x, %z - %2 = and i32 %y, %1 - %3 = or i32 %0, %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll deleted file mode 100644 index 2671c5d102b..00000000000 --- a/test/CodeGen/R600/big_alu.ll +++ /dev/null @@ -1,1173 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cedar - -;This test ensures that R600 backend can handle ifcvt properly -;and do not generate ALU clauses with more than 128 instructions. 
- -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { -main_body: - %0 = extractelement <4 x float> %reg0, i32 0 - %1 = extractelement <4 x float> %reg0, i32 1 - %2 = extractelement <4 x float> %reg0, i32 2 - %3 = extractelement <4 x float> %reg0, i32 3 - %4 = extractelement <4 x float> %reg1, i32 0 - %5 = extractelement <4 x float> %reg9, i32 0 - %6 = extractelement <4 x float> %reg8, i32 0 - %7 = fcmp ugt float %6, 0.000000e+00 - %8 = select i1 %7, float %4, float %5 - %9 = extractelement <4 x float> %reg1, i32 1 - %10 = extractelement <4 x float> %reg9, i32 1 - %11 = extractelement <4 x float> %reg8, i32 0 - %12 = fcmp ugt float %11, 0.000000e+00 - %13 = select i1 %12, float %9, float %10 - %14 = extractelement <4 x float> %reg1, i32 2 - %15 = extractelement <4 x float> %reg9, i32 2 - %16 = extractelement <4 x float> %reg8, i32 0 - %17 = fcmp ugt float %16, 0.000000e+00 - %18 = select i1 %17, float %14, float %15 - %19 = extractelement <4 x float> %reg1, i32 3 - %20 = extractelement <4 x float> %reg9, i32 3 - %21 = extractelement <4 x float> %reg8, i32 0 - %22 = extractelement <4 x float> %reg2, i32 0 - %23 = extractelement <4 x float> %reg2, i32 1 - %24 = extractelement <4 x float> %reg2, i32 2 - %25 = extractelement <4 x float> %reg2, i32 3 - %26 = extractelement <4 x float> %reg3, i32 0 - %27 = extractelement <4 x float> %reg3, i32 1 - %28 = extractelement <4 x float> %reg3, i32 2 - %29 = extractelement <4 x float> %reg3, i32 3 - %30 = extractelement <4 x float> %reg4, i32 0 - %31 = extractelement <4 x float> %reg4, i32 1 - %32 = extractelement <4 x float> %reg4, i32 2 - %33 = extractelement <4 x float> %reg4, i32 3 - %34 = extractelement <4 x float> %reg5, i32 0 - %35 = extractelement <4 x float> %reg5, i32 1 - %36 = extractelement <4 x float> %reg5, i32 2 - %37 = extractelement <4 x float> %reg5, i32 3 - %38 = extractelement <4 x float> %reg6, i32 0 - %39 = extractelement <4 x float> %reg6, i32 1 - %40 = extractelement <4 x float> %reg6, i32 2 - %41 = extractelement <4 x float> %reg6, i32 3 - %42 = extractelement <4 x float> %reg7, i32 0 - %43 = extractelement <4 x float> %reg7, i32 1 - %44 = extractelement <4 x float> %reg7, i32 2 - %45 = extractelement <4 x float> %reg7, i32 3 - %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %47 = extractelement <4 x float> %46, i32 0 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %49 = extractelement <4 x float> %48, i32 1 - %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %51 = extractelement <4 x float> %50, i32 2 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) - %53 = extractelement <4 x float> %52, i32 0 - %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %55 = extractelement <4 x float> %54, i32 0 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %57 
= extractelement <4 x float> %56, i32 1 - %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %59 = extractelement <4 x float> %58, i32 2 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %61 = extractelement <4 x float> %60, i32 3 - %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %63 = extractelement <4 x float> %62, i32 0 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %65 = extractelement <4 x float> %64, i32 1 - %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %67 = extractelement <4 x float> %66, i32 2 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %69 = extractelement <4 x float> %68, i32 0 - %70 = fcmp oge float %69, 3.500000e+00 - %71 = sext i1 %70 to i32 - %72 = bitcast i32 %71 to float - %73 = bitcast float %72 to i32 - %74 = icmp ne i32 %73, 0 - %. = select i1 %74, float 0.000000e+00, float 0.000000e+00 - %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %76 = extractelement <4 x float> %75, i32 0 - %77 = fcmp oge float %76, 2.000000e+00 - %78 = sext i1 %77 to i32 - %79 = bitcast i32 %78 to float - %80 = bitcast float %79 to i32 - %81 = icmp ne i32 %80, 0 - br i1 %81, label %IF137, label %ENDIF136 - -IF137: ; preds = %main_body - %82 = insertelement <4 x float> undef, float %30, i32 0 - %83 = insertelement <4 x float> %82, float %31, i32 1 - %84 = insertelement <4 x float> %83, float %32, i32 2 - %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 - %86 = insertelement <4 x float> undef, float %30, i32 0 - %87 = insertelement <4 x float> %86, float %31, i32 1 - %88 = insertelement <4 x float> %87, float %32, i32 2 - %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 - %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) - %91 = call float @llvm.AMDGPU.rsq.f32(float %90) - %92 = fmul float %30, %91 - %93 = fmul float %31, %91 - %94 = fmul float %32, %91 - %95 = insertelement <4 x float> undef, float %92, i32 0 - %96 = insertelement <4 x float> %95, float %93, i32 1 - %97 = insertelement <4 x float> %96, float %94, i32 2 - %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3 - %99 = insertelement <4 x float> undef, float %37, i32 0 - %100 = insertelement <4 x float> %99, float %38, i32 1 - %101 = insertelement <4 x float> %100, float %39, i32 2 - %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3 - %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102) - %104 = insertelement <4 x float> undef, float %92, i32 0 - %105 = insertelement <4 x float> %104, float %93, i32 1 - %106 = insertelement <4 x float> %105, float %94, i32 2 - %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3 - %108 = insertelement <4 x float> undef, float %40, i32 0 - %109 = insertelement <4 x float> %108, float %41, i32 1 - %110 = insertelement <4 x float> %109, float %42, i32 2 - %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3 - %112 = call 
float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111) - %113 = fsub float -0.000000e+00, %92 - %114 = fsub float -0.000000e+00, %93 - %115 = fsub float -0.000000e+00, %94 - %116 = insertelement <4 x float> undef, float %34, i32 0 - %117 = insertelement <4 x float> %116, float %35, i32 1 - %118 = insertelement <4 x float> %117, float %36, i32 2 - %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3 - %120 = insertelement <4 x float> undef, float %113, i32 0 - %121 = insertelement <4 x float> %120, float %114, i32 1 - %122 = insertelement <4 x float> %121, float %115, i32 2 - %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3 - %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123) - %125 = fdiv float 1.000000e+00, %124 - %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %127 = extractelement <4 x float> %126, i32 0 - %128 = fmul float %127, %125 - %129 = fmul float %103, %128 - %130 = fmul float %112, %128 - %131 = bitcast float %. to i32 - %132 = sitofp i32 %131 to float - %133 = fdiv float 1.000000e+00, %132 - %134 = bitcast float %. to i32 - %135 = add i32 %134, -1 - %136 = bitcast i32 %135 to float - %137 = bitcast float %136 to i32 - br label %LOOP - -ENDIF136: ; preds = %main_body, %ENDIF154 - %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ] - %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ] - %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ] - %138 = fmul float %26, 0x3F847AE140000000 - %139 = fmul float %27, 0x3F847AE140000000 - %140 = fmul float %28, 0x3F847AE140000000 - %141 = insertelement <4 x float> undef, float %138, i32 0 - %142 = insertelement <4 x float> %141, float %139, i32 1 - %143 = insertelement <4 x float> %142, float %140, i32 2 - %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3 - %145 = extractelement <4 x float> %144, i32 0 - %146 = extractelement <4 x float> %144, i32 1 - %147 = extractelement <4 x float> %144, i32 2 - %148 = extractelement <4 x float> %144, i32 3 - %149 = insertelement <4 x float> undef, float %145, i32 0 - %150 = insertelement <4 x float> %149, float %146, i32 1 - %151 = insertelement <4 x float> %150, float %147, i32 2 - %152 = insertelement <4 x float> %151, float %148, i32 3 - %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3) - %154 = extractelement <4 x float> %153, i32 0 - %155 = extractelement <4 x float> %153, i32 1 - %156 = extractelement <4 x float> %153, i32 2 - %157 = extractelement <4 x float> %153, i32 3 - %158 = fmul float %26, 0x3F45A07B40000000 - %159 = fmul float %27, 0x3F45A07B40000000 - %160 = fmul float %28, 0x3F45A07B40000000 - %161 = insertelement <4 x float> undef, float %158, i32 0 - %162 = insertelement <4 x float> %161, float %159, i32 1 - %163 = insertelement <4 x float> %162, float %160, i32 2 - %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3 - %165 = extractelement <4 x float> %164, i32 0 - %166 = extractelement <4 x float> %164, i32 1 - %167 = extractelement <4 x float> %164, i32 2 - %168 = extractelement <4 x float> %164, i32 3 - %169 = insertelement <4 x float> undef, float %165, i32 0 - %170 = insertelement <4 x float> %169, float %166, i32 1 - %171 = insertelement <4 x float> %170, float %167, i32 2 - %172 = insertelement <4 x float> %171, float %168, i32 3 - %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3) - 
%174 = extractelement <4 x float> %173, i32 0 - %175 = extractelement <4 x float> %173, i32 1 - %176 = extractelement <4 x float> %173, i32 2 - %177 = extractelement <4 x float> %173, i32 3 - %178 = fmul float %176, 3.000000e+03 - %179 = fadd float %178, %28 - %180 = fdiv float 1.000000e+00, %33 - %181 = fmul float %32, %180 - %182 = call float @fabs(float %181) - %183 = fmul float %174, 0x3FD99999A0000000 - %184 = fadd float %183, 0x3FAEB851E0000000 - %185 = fmul float %175, 0x3FE3333340000000 - %186 = fadd float %185, %184 - %187 = fmul float %176, 2.000000e+00 - %188 = fadd float %187, %186 - %189 = fmul float %177, 4.000000e+00 - %190 = fadd float %189, %188 - %191 = fmul float %154, 0x3FB99999A0000000 - %192 = fadd float %191, %190 - %193 = fmul float %155, 0x3FD99999A0000000 - %194 = fadd float %193, %192 - %195 = fmul float %156, 0x3FE99999A0000000 - %196 = fadd float %195, %194 - %197 = fmul float %157, 0x4000CCCCC0000000 - %198 = fadd float %197, %196 - %199 = fmul float 0xBE5EFB4CC0000000, %182 - %200 = fmul float %199, %182 - %201 = call float @llvm.AMDIL.exp.(float %200) - %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000) - %203 = fadd float %202, 0x3FF4CCCCC0000000 - %204 = fmul float %203, 0x3FE1C71C80000000 - %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00) - %206 = fadd float %202, 0x3FF4CCCCC0000000 - %207 = fmul float %206, 0x3FE1C71C80000000 - %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00) - %209 = fadd float %202, 2.000000e+00 - %210 = fmul float %209, 0x3FD611A7A0000000 - %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00) - %212 = fmul float 2.000000e+00, %205 - %213 = fsub float -0.000000e+00, %212 - %214 = fadd float 3.000000e+00, %213 - %215 = fmul float %205, %214 - %216 = fmul float %205, %215 - %217 = fmul float 2.000000e+00, %208 - %218 = fsub float -0.000000e+00, %217 - %219 = fadd float 3.000000e+00, %218 - %220 = fmul float %208, %219 - %221 = fmul float %208, %220 - %222 = fmul float 2.000000e+00, %211 - %223 = fsub float -0.000000e+00, %222 - %224 = fadd float 3.000000e+00, %223 - %225 = fmul float %211, %224 - %226 = fmul float %211, %225 - %227 = fmul float %26, 0x3F368B5CC0000000 - %228 = fmul float %27, 0x3F368B5CC0000000 - %229 = insertelement <4 x float> undef, float %227, i32 0 - %230 = insertelement <4 x float> %229, float %228, i32 1 - %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2 - %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3 - %233 = extractelement <4 x float> %232, i32 0 - %234 = extractelement <4 x float> %232, i32 1 - %235 = insertelement <4 x float> undef, float %233, i32 0 - %236 = insertelement <4 x float> %235, float %234, i32 1 - %237 = insertelement <4 x float> %236, float undef, i32 2 - %238 = insertelement <4 x float> %237, float undef, i32 3 - %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2) - %240 = extractelement <4 x float> %239, i32 0 - %241 = insertelement <4 x float> undef, float %240, i32 0 - %242 = insertelement <4 x float> %241, float %228, i32 1 - %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2 - %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3 - %245 = extractelement <4 x float> %244, i32 0 - %246 = insertelement <4 x float> undef, float %245, i32 0 - %247 = insertelement <4 x float> %246, float undef, i32 1 - %248 = insertelement <4 x float> %247, float undef, i32 2 - 
%249 = insertelement <4 x float> %248, float undef, i32 3 - %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1) - %251 = extractelement <4 x float> %250, i32 0 - %252 = extractelement <4 x float> %250, i32 1 - %253 = extractelement <4 x float> %250, i32 2 - %254 = extractelement <4 x float> %250, i32 3 - %255 = fmul float %251, %216 - %256 = fmul float %252, %221 - %257 = fmul float %253, %226 - %258 = fmul float %254, 0.000000e+00 - %259 = fadd float %202, 0x3FF4CCCCC0000000 - %260 = fmul float %259, 0x3FE1C71C80000000 - %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00) - %262 = fadd float %202, 0x3FF4CCCCC0000000 - %263 = fmul float %262, 0x3FE1C71C80000000 - %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00) - %265 = fadd float %202, 2.000000e+00 - %266 = fmul float %265, 0x3FD611A7A0000000 - %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00) - %268 = fmul float 2.000000e+00, %261 - %269 = fsub float -0.000000e+00, %268 - %270 = fadd float 3.000000e+00, %269 - %271 = fmul float %261, %270 - %272 = fmul float %261, %271 - %273 = fmul float 2.000000e+00, %264 - %274 = fsub float -0.000000e+00, %273 - %275 = fadd float 3.000000e+00, %274 - %276 = fmul float %264, %275 - %277 = fmul float %264, %276 - %278 = fmul float 2.000000e+00, %267 - %279 = fsub float -0.000000e+00, %278 - %280 = fadd float 3.000000e+00, %279 - %281 = fmul float %267, %280 - %282 = fmul float %267, %281 - %283 = fmul float %26, 0x3F22DFD6A0000000 - %284 = fmul float %27, 0x3F22DFD6A0000000 - %285 = insertelement <4 x float> undef, float %283, i32 0 - %286 = insertelement <4 x float> %285, float %284, i32 1 - %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2 - %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3 - %289 = extractelement <4 x float> %288, i32 0 - %290 = extractelement <4 x float> %288, i32 1 - %291 = insertelement <4 x float> undef, float %289, i32 0 - %292 = insertelement <4 x float> %291, float %290, i32 1 - %293 = insertelement <4 x float> %292, float undef, i32 2 - %294 = insertelement <4 x float> %293, float undef, i32 3 - %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2) - %296 = extractelement <4 x float> %295, i32 0 - %297 = extractelement <4 x float> %295, i32 1 - %298 = extractelement <4 x float> %295, i32 2 - %299 = extractelement <4 x float> %295, i32 3 - %300 = fmul float %296, %272 - %301 = fmul float %297, %277 - %302 = fmul float %298, %282 - %303 = fmul float %299, 0.000000e+00 - %304 = fmul float %temp68.1, %37 - %305 = fmul float %temp68.1, %38 - %306 = fmul float %temp68.1, %39 - %307 = fmul float %temp69.0, %40 - %308 = fadd float %307, %304 - %309 = fmul float %temp69.0, %41 - %310 = fadd float %309, %305 - %311 = fmul float %temp69.0, %42 - %312 = fadd float %311, %306 - %313 = fmul float %temp70.0, %34 - %314 = fadd float %313, %308 - %315 = fmul float %temp70.0, %35 - %316 = fadd float %315, %310 - %317 = fmul float %temp70.0, %36 - %318 = fadd float %317, %312 - %319 = insertelement <4 x float> undef, float %314, i32 0 - %320 = insertelement <4 x float> %319, float %316, i32 1 - %321 = insertelement <4 x float> %320, float %318, i32 2 - %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3 - %323 = insertelement <4 x float> undef, float %314, i32 0 - %324 = insertelement <4 x float> %323, float %316, i32 1 - %325 = insertelement <4 x float> %324, float %318, i32 2 - 
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 - %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) - %328 = call float @llvm.AMDGPU.rsq.f32(float %327) - %329 = fmul float %314, %328 - %330 = fmul float %316, %328 - %331 = fmul float %318, %328 - %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %333 = extractelement <4 x float> %332, i32 0 - %334 = fsub float -0.000000e+00, %333 - %335 = fadd float 1.000000e+00, %334 - %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %337 = extractelement <4 x float> %336, i32 0 - %338 = fsub float -0.000000e+00, %337 - %339 = fadd float 1.000000e+00, %338 - %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %341 = extractelement <4 x float> %340, i32 0 - %342 = fsub float -0.000000e+00, %341 - %343 = fadd float 1.000000e+00, %342 - %344 = fsub float -0.000000e+00, %335 - %345 = fadd float %202, %344 - %346 = fsub float -0.000000e+00, %339 - %347 = fadd float %202, %346 - %348 = fadd float %347, 0xBFE3333340000000 - %349 = fsub float -0.000000e+00, %202 - %350 = fsub float -0.000000e+00, %343 - %351 = fadd float %349, %350 - %352 = insertelement <4 x float> undef, float %43, i32 0 - %353 = insertelement <4 x float> %352, float %44, i32 1 - %354 = insertelement <4 x float> %353, float %45, i32 2 - %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3 - %356 = insertelement <4 x float> undef, float %43, i32 0 - %357 = insertelement <4 x float> %356, float %44, i32 1 - %358 = insertelement <4 x float> %357, float %45, i32 2 - %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 - %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) - %361 = call float @llvm.AMDGPU.rsq.f32(float %360) - %362 = fmul float %45, %361 - %363 = call float @fabs(float %362) - %364 = fmul float %176, 0x3FECCCCCC0000000 - %365 = fadd float %364, %363 - %366 = fadd float %365, 0xBFEFAE1480000000 - %367 = fmul float %366, 0xC023FFFFC0000000 - %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00) - %369 = fsub float -0.000000e+00, %335 - %370 = fadd float %202, %369 - %371 = fadd float %370, 0x3FBEB851E0000000 - %372 = fsub float -0.000000e+00, %339 - %373 = fadd float %202, %372 - %374 = fadd float %373, 0xBFE0A3D700000000 - %375 = fsub float -0.000000e+00, %202 - %376 = fsub float -0.000000e+00, %343 - %377 = fadd float %375, %376 - %378 = insertelement <4 x float> undef, float %43, i32 0 - %379 = insertelement <4 x float> %378, float %44, i32 1 - %380 = insertelement <4 x float> %379, float %45, i32 2 - %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3 - %382 = insertelement <4 x float> undef, float %43, i32 0 - %383 = insertelement <4 x float> %382, float %44, i32 1 - %384 = insertelement <4 x float> %383, float %45, i32 2 - %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 - %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) - %387 = call float @llvm.AMDGPU.rsq.f32(float %386) - %388 = fmul float %45, %387 - %389 = call float @fabs(float %388) - %390 = fmul float %176, 0x3FF51EB860000000 - %391 = fadd float %390, %389 - %392 = fadd float %391, 0xBFEFAE1480000000 - %393 = fmul float %392, 0xC0490001A0000000 - %394 = call float 
@llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00) - %395 = fmul float 2.000000e+00, %368 - %396 = fsub float -0.000000e+00, %395 - %397 = fadd float 3.000000e+00, %396 - %398 = fmul float %368, %397 - %399 = fmul float %368, %398 - %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345) - %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348) - %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351) - %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00) - %404 = fmul float 2.000000e+00, %394 - %405 = fsub float -0.000000e+00, %404 - %406 = fadd float 3.000000e+00, %405 - %407 = fmul float %394, %406 - %408 = fmul float %394, %407 - %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371) - %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374) - %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377) - %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000) - %413 = fcmp oge float 2.200000e+03, %179 - %414 = sext i1 %413 to i32 - %415 = bitcast i32 %414 to float - %416 = bitcast float %415 to i32 - %417 = icmp ne i32 %416, 0 - br i1 %417, label %IF161, label %ENDIF160 - -LOOP: ; preds = %ENDIF139, %IF137 - %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ] - %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ] - %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ] - %418 = bitcast float %temp96.0 to i32 - %419 = icmp sge i32 %418, %137 - %420 = sext i1 %419 to i32 - %421 = bitcast i32 %420 to float - %422 = bitcast float %421 to i32 - %423 = icmp ne i32 %422, 0 - br i1 %423, label %IF140, label %ENDIF139 - -IF140: ; preds = %LOOP - %424 = fmul float %133, 5.000000e-01 - %425 = fmul float %129, %temp92.0 - %426 = fadd float %425, %22 - %427 = fmul float %130, %temp92.0 - %428 = fadd float %427, %23 - %429 = insertelement <4 x float> undef, float %426, i32 0 - %430 = insertelement <4 x float> %429, float %428, i32 1 - %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2 - %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3 - %433 = extractelement <4 x float> %432, i32 0 - %434 = extractelement <4 x float> %432, i32 1 - %435 = insertelement <4 x float> undef, float %433, i32 0 - %436 = insertelement <4 x float> %435, float %434, i32 1 - %437 = insertelement <4 x float> %436, float undef, i32 2 - %438 = insertelement <4 x float> %437, float undef, i32 3 - %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2) - %440 = extractelement <4 x float> %439, i32 3 - %441 = fcmp oge float %temp92.0, %440 - %442 = sext i1 %441 to i32 - %443 = bitcast i32 %442 to float - %444 = bitcast float %443 to i32 - %445 = icmp ne i32 %444, 0 - br i1 %445, label %IF146, label %ENDIF145 - -ENDIF139: ; preds = %LOOP - %446 = fadd float %temp88.0, %133 - %447 = fmul float %129, %446 - %448 = fadd float %447, %22 - %449 = fmul float %130, %446 - %450 = fadd float %449, %23 - %451 = insertelement <4 x float> undef, float %448, i32 0 - %452 = insertelement <4 x float> %451, float %450, i32 1 - %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2 - %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3 - %455 = extractelement <4 x float> %454, i32 0 - %456 = extractelement <4 x float> %454, i32 1 - %457 = insertelement <4 x float> undef, float %455, i32 0 - %458 = insertelement <4 x float> %457, float %456, i32 1 - %459 = 
insertelement <4 x float> %458, float undef, i32 2 - %460 = insertelement <4 x float> %459, float undef, i32 3 - %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2) - %462 = extractelement <4 x float> %461, i32 3 - %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0 - %464 = sext i1 %463 to i32 - %465 = bitcast i32 %464 to float - %466 = fcmp oge float %446, %462 - %467 = sext i1 %466 to i32 - %468 = bitcast i32 %467 to float - %469 = bitcast float %465 to i32 - %470 = bitcast float %468 to i32 - %471 = and i32 %469, %470 - %472 = bitcast i32 %471 to float - %473 = bitcast float %472 to i32 - %474 = icmp ne i32 %473, 0 - %.temp92.0 = select i1 %474, float %446, float %temp92.0 - %475 = bitcast float %temp96.0 to i32 - %476 = add i32 %475, 1 - %477 = bitcast i32 %476 to float - br label %LOOP - -IF146: ; preds = %IF140 - %478 = fmul float 2.000000e+00, %424 - %479 = fsub float -0.000000e+00, %478 - %480 = fadd float %temp92.0, %479 - br label %ENDIF145 - -ENDIF145: ; preds = %IF140, %IF146 - %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ] - %481 = fadd float %temp88.1, %424 - %482 = fmul float %424, 5.000000e-01 - %483 = fmul float %129, %481 - %484 = fadd float %483, %22 - %485 = fmul float %130, %481 - %486 = fadd float %485, %23 - %487 = insertelement <4 x float> undef, float %484, i32 0 - %488 = insertelement <4 x float> %487, float %486, i32 1 - %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2 - %490 = insertelement <4 x float> %489, float %440, i32 3 - %491 = extractelement <4 x float> %490, i32 0 - %492 = extractelement <4 x float> %490, i32 1 - %493 = insertelement <4 x float> undef, float %491, i32 0 - %494 = insertelement <4 x float> %493, float %492, i32 1 - %495 = insertelement <4 x float> %494, float undef, i32 2 - %496 = insertelement <4 x float> %495, float undef, i32 3 - %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2) - %498 = extractelement <4 x float> %497, i32 3 - %499 = fcmp oge float %481, %498 - %500 = sext i1 %499 to i32 - %501 = bitcast i32 %500 to float - %502 = bitcast float %501 to i32 - %503 = icmp ne i32 %502, 0 - br i1 %503, label %IF149, label %ENDIF148 - -IF149: ; preds = %ENDIF145 - %504 = fmul float 2.000000e+00, %482 - %505 = fsub float -0.000000e+00, %504 - %506 = fadd float %481, %505 - br label %ENDIF148 - -ENDIF148: ; preds = %ENDIF145, %IF149 - %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ] - %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ] - %507 = fadd float %temp88.2, %482 - %508 = fmul float %482, 5.000000e-01 - %509 = fmul float %129, %507 - %510 = fadd float %509, %22 - %511 = fmul float %130, %507 - %512 = fadd float %511, %23 - %513 = insertelement <4 x float> undef, float %510, i32 0 - %514 = insertelement <4 x float> %513, float %512, i32 1 - %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2 - %516 = insertelement <4 x float> %515, float %498, i32 3 - %517 = extractelement <4 x float> %516, i32 0 - %518 = extractelement <4 x float> %516, i32 1 - %519 = insertelement <4 x float> undef, float %517, i32 0 - %520 = insertelement <4 x float> %519, float %518, i32 1 - %521 = insertelement <4 x float> %520, float undef, i32 2 - %522 = insertelement <4 x float> %521, float undef, i32 3 - %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2) - %524 = extractelement <4 x float> %523, i32 3 - %525 = fcmp oge float %507, %524 - %526 = sext i1 %525 to i32 - %527 = bitcast i32 %526 to float - 
%528 = bitcast float %527 to i32 - %529 = icmp ne i32 %528, 0 - br i1 %529, label %IF152, label %ENDIF151 - -IF152: ; preds = %ENDIF148 - %530 = fmul float 2.000000e+00, %508 - %531 = fsub float -0.000000e+00, %530 - %532 = fadd float %507, %531 - br label %ENDIF151 - -ENDIF151: ; preds = %ENDIF148, %IF152 - %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ] - %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ] - %533 = fadd float %temp88.3, %508 - %534 = fmul float %508, 5.000000e-01 - %535 = fmul float %129, %533 - %536 = fadd float %535, %22 - %537 = fmul float %130, %533 - %538 = fadd float %537, %23 - %539 = insertelement <4 x float> undef, float %536, i32 0 - %540 = insertelement <4 x float> %539, float %538, i32 1 - %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2 - %542 = insertelement <4 x float> %541, float %524, i32 3 - %543 = extractelement <4 x float> %542, i32 0 - %544 = extractelement <4 x float> %542, i32 1 - %545 = insertelement <4 x float> undef, float %543, i32 0 - %546 = insertelement <4 x float> %545, float %544, i32 1 - %547 = insertelement <4 x float> %546, float undef, i32 2 - %548 = insertelement <4 x float> %547, float undef, i32 3 - %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2) - %550 = extractelement <4 x float> %549, i32 3 - %551 = fcmp oge float %533, %550 - %552 = sext i1 %551 to i32 - %553 = bitcast i32 %552 to float - %554 = bitcast float %553 to i32 - %555 = icmp ne i32 %554, 0 - br i1 %555, label %IF155, label %ENDIF154 - -IF155: ; preds = %ENDIF151 - %556 = fmul float 2.000000e+00, %534 - %557 = fsub float -0.000000e+00, %556 - %558 = fadd float %533, %557 - br label %ENDIF154 - -ENDIF154: ; preds = %ENDIF151, %IF155 - %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ] - %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ] - %559 = fadd float %temp88.4, %534 - %560 = fmul float %129, %559 - %561 = fadd float %560, %22 - %562 = fmul float %130, %559 - %563 = fadd float %562, %23 - %564 = insertelement <4 x float> undef, float %561, i32 0 - %565 = insertelement <4 x float> %564, float %563, i32 1 - %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2 - %567 = insertelement <4 x float> %566, float %550, i32 3 - %568 = extractelement <4 x float> %567, i32 0 - %569 = extractelement <4 x float> %567, i32 1 - %570 = insertelement <4 x float> undef, float %568, i32 0 - %571 = insertelement <4 x float> %570, float %569, i32 1 - %572 = insertelement <4 x float> %571, float undef, i32 2 - %573 = insertelement <4 x float> %572, float undef, i32 3 - %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2) - %575 = extractelement <4 x float> %574, i32 3 - %576 = fcmp oge float %559, %575 - %577 = sext i1 %576 to i32 - %578 = bitcast i32 %577 to float - %579 = bitcast float %578 to i32 - %580 = icmp ne i32 %579, 0 - %.temp92.4 = select i1 %580, float %559, float %temp92.4 - %581 = fmul float %129, %.temp92.4 - %582 = fadd float %581, %22 - %583 = fmul float %130, %.temp92.4 - %584 = fadd float %583, %23 - %585 = insertelement <4 x float> undef, float %582, i32 0 - %586 = insertelement <4 x float> %585, float %584, i32 1 - %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2 - %588 = insertelement <4 x float> %587, float %575, i32 3 - %589 = extractelement <4 x float> %588, i32 0 - %590 = extractelement <4 x float> %588, i32 1 - %591 = insertelement <4 x float> undef, float %589, i32 0 - %592 = insertelement <4 x float> %591, 
float %590, i32 1 - %593 = insertelement <4 x float> %592, float undef, i32 2 - %594 = insertelement <4 x float> %593, float undef, i32 3 - %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2) - %596 = extractelement <4 x float> %595, i32 0 - %597 = extractelement <4 x float> %595, i32 1 - %598 = extractelement <4 x float> %595, i32 2 - %599 = fmul float %596, 2.000000e+00 - %600 = fadd float %599, -1.000000e+00 - %601 = fmul float %597, 2.000000e+00 - %602 = fadd float %601, -1.000000e+00 - %603 = fmul float %598, 2.000000e+00 - %604 = fadd float %603, -1.000000e+00 - br label %ENDIF136 - -IF161: ; preds = %ENDIF136 - %605 = fmul float %202, 0x3FB99999A0000000 - %606 = fcmp uge float 0x3FE4CCCCC0000000, %605 - %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605 - %608 = fcmp uge float %607, 5.000000e-01 - %609 = select i1 %608, float 5.000000e-01, float %607 - %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300) - %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301) - %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302) - %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303) - %614 = insertelement <4 x float> undef, float %329, i32 0 - %615 = insertelement <4 x float> %614, float %330, i32 1 - %616 = insertelement <4 x float> %615, float %331, i32 2 - %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3 - %618 = insertelement <4 x float> undef, float %63, i32 0 - %619 = insertelement <4 x float> %618, float %65, i32 1 - %620 = insertelement <4 x float> %619, float %67, i32 2 - %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3 - %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621) - %623 = fcmp uge float 0x3FE6666660000000, %622 - %624 = select i1 %623, float 0x3FE6666660000000, float %622 - %625 = fmul float %8, %624 - %626 = fmul float %13, %624 - %627 = fmul float %18, %624 - %628 = insertelement <4 x float> undef, float %34, i32 0 - %629 = insertelement <4 x float> %628, float %35, i32 1 - %630 = insertelement <4 x float> %629, float %36, i32 2 - %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3 - %632 = insertelement <4 x float> undef, float %63, i32 0 - %633 = insertelement <4 x float> %632, float %65, i32 1 - %634 = insertelement <4 x float> %633, float %67, i32 2 - %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3 - %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635) - %637 = fcmp uge float 0x3FECCCCCC0000000, %636 - %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636 - %639 = fmul float %625, %638 - %640 = fmul float %626, %638 - %641 = fmul float %627, %638 - br label %ENDIF160 - -ENDIF160: ; preds = %ENDIF136, %IF161 - %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ] - %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ] - %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ] - %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ] - %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ] - %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ] - %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ] - %642 = fcmp olt float 2.200000e+03, %179 - %643 = sext i1 %642 to i32 - %644 = bitcast i32 %643 to float - %645 = fcmp olt float %179, 2.300000e+03 - %646 = sext i1 %645 to i32 - %647 = bitcast i32 %646 to float - %648 = bitcast float %644 to i32 - %649 = bitcast float %647 to i32 - %650 = and i32 %648, %649 - %651 = 
bitcast i32 %650 to float - %652 = bitcast float %651 to i32 - %653 = icmp ne i32 %652, 0 - br i1 %653, label %IF164, label %ENDIF163 - -IF164: ; preds = %ENDIF160 - %654 = fmul float %202, 5.000000e-01 - %655 = fcmp uge float 0x3FE4CCCCC0000000, %654 - %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654 - %657 = fcmp uge float %656, 0x3FD6666660000000 - %658 = select i1 %657, float 0x3FD6666660000000, float %656 - %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300) - %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301) - %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302) - %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303) - %663 = insertelement <4 x float> undef, float %329, i32 0 - %664 = insertelement <4 x float> %663, float %330, i32 1 - %665 = insertelement <4 x float> %664, float %331, i32 2 - %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3 - %667 = insertelement <4 x float> undef, float %63, i32 0 - %668 = insertelement <4 x float> %667, float %65, i32 1 - %669 = insertelement <4 x float> %668, float %67, i32 2 - %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3 - %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670) - %672 = fcmp uge float 0x3FE6666660000000, %671 - %673 = select i1 %672, float 0x3FE6666660000000, float %671 - %674 = fmul float %8, %673 - %675 = fmul float %13, %673 - %676 = fmul float %18, %673 - %677 = insertelement <4 x float> undef, float %34, i32 0 - %678 = insertelement <4 x float> %677, float %35, i32 1 - %679 = insertelement <4 x float> %678, float %36, i32 2 - %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3 - %681 = insertelement <4 x float> undef, float %63, i32 0 - %682 = insertelement <4 x float> %681, float %65, i32 1 - %683 = insertelement <4 x float> %682, float %67, i32 2 - %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3 - %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684) - %686 = fcmp uge float 0x3FECCCCCC0000000, %685 - %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685 - %688 = fmul float %674, %687 - %689 = fmul float %675, %687 - %690 = fmul float %676, %687 - br label %ENDIF163 - -ENDIF163: ; preds = %ENDIF160, %IF164 - %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ] - %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ] - %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ] - %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ] - %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ] - %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ] - %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ] - %691 = fcmp oge float %179, 2.300000e+03 - %692 = sext i1 %691 to i32 - %693 = bitcast i32 %692 to float - %694 = fcmp olt float %179, 2.480000e+03 - %695 = sext i1 %694 to i32 - %696 = bitcast i32 %695 to float - %697 = bitcast float %693 to i32 - %698 = bitcast float %696 to i32 - %699 = and i32 %697, %698 - %700 = bitcast i32 %699 to float - %701 = bitcast float %700 to i32 - %702 = icmp ne i32 %701, 0 - br i1 %702, label %IF167, label %ENDIF166 - -IF167: ; preds = %ENDIF163 - %703 = fmul float %202, 5.000000e-01 - %704 = fcmp uge float 0x3FE4CCCCC0000000, %703 - %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703 - %706 = fcmp uge float %705, 0x3FD3333340000000 - %707 = select i1 %706, float 0x3FD3333340000000, float %705 - %708 = call float 
@llvm.AMDGPU.lrp(float %707, float %409, float %300) - %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301) - %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302) - %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303) - %712 = insertelement <4 x float> undef, float %329, i32 0 - %713 = insertelement <4 x float> %712, float %330, i32 1 - %714 = insertelement <4 x float> %713, float %331, i32 2 - %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3 - %716 = insertelement <4 x float> undef, float %63, i32 0 - %717 = insertelement <4 x float> %716, float %65, i32 1 - %718 = insertelement <4 x float> %717, float %67, i32 2 - %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3 - %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719) - %721 = fcmp uge float 0x3FEB333340000000, %720 - %722 = select i1 %721, float 0x3FEB333340000000, float %720 - %723 = fmul float %8, %722 - %724 = fmul float %13, %722 - %725 = fmul float %18, %722 - %726 = insertelement <4 x float> undef, float %34, i32 0 - %727 = insertelement <4 x float> %726, float %35, i32 1 - %728 = insertelement <4 x float> %727, float %36, i32 2 - %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3 - %730 = insertelement <4 x float> undef, float %63, i32 0 - %731 = insertelement <4 x float> %730, float %65, i32 1 - %732 = insertelement <4 x float> %731, float %67, i32 2 - %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3 - %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733) - %735 = fcmp uge float 0x3FECCCCCC0000000, %734 - %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734 - %737 = fmul float %723, %736 - %738 = fmul float %724, %736 - %739 = fmul float %725, %736 - br label %ENDIF166 - -ENDIF166: ; preds = %ENDIF163, %IF167 - %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ] - %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ] - %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ] - %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ] - %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ] - %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ] - %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ] - %740 = fcmp oge float %179, 2.480000e+03 - %741 = sext i1 %740 to i32 - %742 = bitcast i32 %741 to float - %743 = fcmp olt float %179, 2.530000e+03 - %744 = sext i1 %743 to i32 - %745 = bitcast i32 %744 to float - %746 = bitcast float %742 to i32 - %747 = bitcast float %745 to i32 - %748 = and i32 %746, %747 - %749 = bitcast i32 %748 to float - %750 = bitcast float %749 to i32 - %751 = icmp ne i32 %750, 0 - br i1 %751, label %IF170, label %ENDIF169 - -IF170: ; preds = %ENDIF166 - %752 = fmul float %202, 5.000000e-01 - %753 = fcmp uge float 0x3FE4CCCCC0000000, %752 - %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752 - %755 = fcmp uge float %754, 0x3FC99999A0000000 - %756 = select i1 %755, float 0x3FC99999A0000000, float %754 - %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300) - %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301) - %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302) - %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303) - %761 = insertelement <4 x float> undef, float %329, i32 0 - %762 = insertelement <4 x float> %761, float %330, i32 1 - %763 = insertelement <4 x float> %762, float %331, i32 2 - %764 = 
insertelement <4 x float> %763, float 0.000000e+00, i32 3 - %765 = insertelement <4 x float> undef, float %63, i32 0 - %766 = insertelement <4 x float> %765, float %65, i32 1 - %767 = insertelement <4 x float> %766, float %67, i32 2 - %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3 - %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768) - %770 = fcmp uge float 0x3FEB333340000000, %769 - %771 = select i1 %770, float 0x3FEB333340000000, float %769 - %772 = fmul float %8, %771 - %773 = fmul float %13, %771 - %774 = fmul float %18, %771 - %775 = insertelement <4 x float> undef, float %34, i32 0 - %776 = insertelement <4 x float> %775, float %35, i32 1 - %777 = insertelement <4 x float> %776, float %36, i32 2 - %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3 - %779 = insertelement <4 x float> undef, float %63, i32 0 - %780 = insertelement <4 x float> %779, float %65, i32 1 - %781 = insertelement <4 x float> %780, float %67, i32 2 - %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3 - %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782) - %784 = fcmp uge float 0x3FECCCCCC0000000, %783 - %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783 - %786 = fmul float %772, %785 - %787 = fmul float %773, %785 - %788 = fmul float %774, %785 - br label %ENDIF169 - -ENDIF169: ; preds = %ENDIF166, %IF170 - %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ] - %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ] - %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ] - %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ] - %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ] - %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ] - %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ] - %789 = fcmp oge float %179, 2.530000e+03 - %790 = sext i1 %789 to i32 - %791 = bitcast i32 %790 to float - %792 = fcmp olt float %179, 2.670000e+03 - %793 = sext i1 %792 to i32 - %794 = bitcast i32 %793 to float - %795 = bitcast float %791 to i32 - %796 = bitcast float %794 to i32 - %797 = and i32 %795, %796 - %798 = bitcast i32 %797 to float - %799 = bitcast float %798 to i32 - %800 = icmp ne i32 %799, 0 - br i1 %800, label %IF173, label %ENDIF172 - -IF173: ; preds = %ENDIF169 - %801 = fmul float %202, 5.000000e-01 - %802 = fcmp uge float 0x3FE4CCCCC0000000, %801 - %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801 - %804 = fcmp uge float %803, 0x3FB99999A0000000 - %805 = select i1 %804, float 0x3FB99999A0000000, float %803 - %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300) - %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301) - %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302) - %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303) - %810 = insertelement <4 x float> undef, float %329, i32 0 - %811 = insertelement <4 x float> %810, float %330, i32 1 - %812 = insertelement <4 x float> %811, float %331, i32 2 - %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3 - %814 = insertelement <4 x float> undef, float %63, i32 0 - %815 = insertelement <4 x float> %814, float %65, i32 1 - %816 = insertelement <4 x float> %815, float %67, i32 2 - %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3 - %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817) - %819 = fcmp uge float 0x3FEB333340000000, %818 - %820 = select i1 %819, float 
0x3FEB333340000000, float %818 - %821 = fmul float %8, %820 - %822 = fmul float %13, %820 - %823 = fmul float %18, %820 - %824 = insertelement <4 x float> undef, float %34, i32 0 - %825 = insertelement <4 x float> %824, float %35, i32 1 - %826 = insertelement <4 x float> %825, float %36, i32 2 - %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3 - %828 = insertelement <4 x float> undef, float %63, i32 0 - %829 = insertelement <4 x float> %828, float %65, i32 1 - %830 = insertelement <4 x float> %829, float %67, i32 2 - %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3 - %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831) - %833 = fcmp uge float 0x3FECCCCCC0000000, %832 - %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832 - %835 = fmul float %821, %834 - %836 = fmul float %822, %834 - %837 = fmul float %823, %834 - br label %ENDIF172 - -ENDIF172: ; preds = %ENDIF169, %IF173 - %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ] - %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ] - %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ] - %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ] - %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ] - %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ] - %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ] - %838 = fcmp oge float %179, 2.670000e+03 - %839 = sext i1 %838 to i32 - %840 = bitcast i32 %839 to float - %841 = bitcast float %840 to i32 - %842 = icmp ne i32 %841, 0 - br i1 %842, label %IF176, label %ENDIF175 - -IF176: ; preds = %ENDIF172 - %843 = fmul float %202, 0x3FB99999A0000000 - %844 = fcmp uge float 0.000000e+00, %843 - %845 = select i1 %844, float 0.000000e+00, float %843 - %846 = fcmp uge float %845, 0x3FD99999A0000000 - %847 = select i1 %846, float 0x3FD99999A0000000, float %845 - %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300) - %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301) - %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302) - %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303) - %852 = insertelement <4 x float> undef, float %329, i32 0 - %853 = insertelement <4 x float> %852, float %330, i32 1 - %854 = insertelement <4 x float> %853, float %331, i32 2 - %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3 - %856 = insertelement <4 x float> undef, float %63, i32 0 - %857 = insertelement <4 x float> %856, float %65, i32 1 - %858 = insertelement <4 x float> %857, float %67, i32 2 - %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3 - %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859) - %861 = fcmp uge float 0x3FEB333340000000, %860 - %862 = select i1 %861, float 0x3FEB333340000000, float %860 - %863 = fmul float %8, %862 - %864 = fmul float %13, %862 - %865 = fmul float %18, %862 - %866 = insertelement <4 x float> undef, float %34, i32 0 - %867 = insertelement <4 x float> %866, float %35, i32 1 - %868 = insertelement <4 x float> %867, float %36, i32 2 - %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3 - %870 = insertelement <4 x float> undef, float %63, i32 0 - %871 = insertelement <4 x float> %870, float %65, i32 1 - %872 = insertelement <4 x float> %871, float %67, i32 2 - %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3 - %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873) - %875 = fcmp uge float 
0x3FECCCCCC0000000, %874 - %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874 - %877 = fmul float %863, %876 - %878 = fmul float %864, %876 - %879 = fmul float %865, %876 - br label %ENDIF175 - -ENDIF175: ; preds = %ENDIF172, %IF176 - %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ] - %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ] - %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ] - %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ] - %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ] - %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ] - %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ] - %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %881 = extractelement <4 x float> %880, i32 0 - %882 = fcmp olt float %881, %179 - %883 = sext i1 %882 to i32 - %884 = bitcast i32 %883 to float - %885 = bitcast float %884 to i32 - %886 = icmp ne i32 %885, 0 - br i1 %886, label %IF179, label %ENDIF178 - -IF179: ; preds = %ENDIF175 - %887 = fadd float %202, 1.000000e+00 - %888 = fadd float %202, 1.000000e+00 - %889 = fadd float %202, 1.000000e+00 - %890 = insertelement <4 x float> undef, float %43, i32 0 - %891 = insertelement <4 x float> %890, float %44, i32 1 - %892 = insertelement <4 x float> %891, float %45, i32 2 - %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3 - %894 = insertelement <4 x float> undef, float %43, i32 0 - %895 = insertelement <4 x float> %894, float %44, i32 1 - %896 = insertelement <4 x float> %895, float %45, i32 2 - %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 - %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) - %899 = call float @llvm.AMDGPU.rsq.f32(float %898) - %900 = fmul float %45, %899 - %901 = call float @fabs(float %900) - %902 = fmul float %176, 0x3FECCCCCC0000000 - %903 = fadd float %902, %901 - %904 = fadd float %903, 0xBFEFAE1480000000 - %905 = fmul float %904, 0xC043FFFE20000000 - %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00) - %907 = fmul float 2.000000e+00, %906 - %908 = fsub float -0.000000e+00, %907 - %909 = fadd float 3.000000e+00, %908 - %910 = fmul float %906, %909 - %911 = fmul float %906, %910 - %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887) - %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888) - %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889) - %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00) - %916 = fmul float %202, 5.000000e-01 - %917 = fcmp uge float 0x3FE4CCCCC0000000, %916 - %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916 - %919 = fcmp uge float %918, 0x3FE3333340000000 - %920 = select i1 %919, float 0x3FE3333340000000, float %918 - %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5) - %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5) - %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5) - %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5) - %925 = insertelement <4 x float> undef, float %329, i32 0 - %926 = insertelement <4 x float> %925, float %330, i32 1 - %927 = insertelement <4 x float> %926, float %331, i32 2 - %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3 - %929 = insertelement <4 x float> 
undef, float %63, i32 0 - %930 = insertelement <4 x float> %929, float %65, i32 1 - %931 = insertelement <4 x float> %930, float %67, i32 2 - %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3 - %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932) - %934 = fcmp uge float 0x3FE99999A0000000, %933 - %935 = select i1 %934, float 0x3FE99999A0000000, float %933 - %936 = fmul float %8, %935 - %937 = fmul float %13, %935 - %938 = fmul float %18, %935 - %939 = insertelement <4 x float> undef, float %34, i32 0 - %940 = insertelement <4 x float> %939, float %35, i32 1 - %941 = insertelement <4 x float> %940, float %36, i32 2 - %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3 - %943 = insertelement <4 x float> undef, float %63, i32 0 - %944 = insertelement <4 x float> %943, float %65, i32 1 - %945 = insertelement <4 x float> %944, float %67, i32 2 - %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3 - %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946) - %948 = fcmp uge float 0x3FECCCCCC0000000, %947 - %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947 - %950 = fmul float %936, %949 - %951 = fmul float %937, %949 - %952 = fmul float %938, %949 - br label %ENDIF178 - -ENDIF178: ; preds = %ENDIF175, %IF179 - %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ] - %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ] - %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ] - %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ] - %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ] - %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ] - %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ] - %953 = fmul float %55, %temp92.12 - %954 = fmul float %57, %temp93.6 - %955 = fmul float %59, %temp94.6 - %956 = fmul float %61, 0.000000e+00 - %957 = fmul float %temp84.6, %953 - %958 = fmul float %temp85.6, %954 - %959 = fmul float %temp86.6, %955 - %960 = fmul float %temp87.6, %956 - %961 = fmul float %2, -2.000000e+00 - %962 = fadd float %961, 1.000000e+00 - %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23) - %964 = extractelement <4 x float> %963, i32 2 - %965 = fsub float -0.000000e+00, %964 - %966 = fadd float %962, %965 - %967 = fdiv float 1.000000e+00, %966 - %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24) - %969 = extractelement <4 x float> %968, i32 2 - %970 = fmul float %969, %967 - %971 = fsub float -0.000000e+00, %53 - %972 = fmul float %971, %53 - %973 = fmul float %972, %970 - %974 = fmul float %973, %970 - %975 = fmul float %974, 0x3FF7154760000000 - %976 = call float @llvm.AMDIL.exp.(float %975) - %977 = fcmp oeq float %53, 1.000000e+00 - %978 = sext i1 %977 to i32 - %979 = bitcast i32 %978 to float - %980 = bitcast float %979 to i32 - %981 = icmp ne i32 %980, 0 - %.184 = select i1 %981, float 1.000000e+00, float %976 - %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47) - %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49) - %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51) - %985 = insertelement <4 x float> undef, float %982, i32 0 - %986 = insertelement <4 x float> %985, float %983, i32 1 - %987 = insertelement <4 x float> %986, float %984, i32 2 - %988 = insertelement <4 x float> 
%987, float %960, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } -attributes #2 = { readonly } diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll deleted file mode 100644 index fd56d956bf3..00000000000 --- a/test/CodeGen/R600/bitcast.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; This test just checks that the compiler doesn't crash. - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; FUNC-LABEL: {{^}}v32i8_to_v8i32: -; SI: s_endpgm -define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { -entry: - %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 - %2 = bitcast <32 x i8> %1 to <8 x i32> - %3 = extractelement <8 x i32> %2, i32 1 - %4 = icmp ne i32 %3, 0 - %5 = select i1 %4, float 0.0, float 1.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void -} - -; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: -; SI: s_endpgm -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* - %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 - store <16 x i8> %1, <16 x i8> addrspace(1)* %out - ret void -} - -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %load = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %load to <2 x i16> - store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 - ret void -} - -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 - %bc = bitcast <2 x i16> %load to float - store float %bc, float addrspace(1)* %out, align 4 - ret void -} - -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %bc = bitcast <4 x i8> %load to i32 - store i32 %bc, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %bc = bitcast i32 %load to <4 x i8> - store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: -; SI: s_endpgm -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %add = add <2 x i32> %val, - %bc = bitcast <2 x i32> %add to double - store double %bc, double addrspace(1)* %out, 
align 8 - ret void -} - -; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: -; SI: s_endpgm -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { - %val = load double, double addrspace(1)* %in, align 8 - %add = fadd double %val, 4.0 - %bc = bitcast double %add to <2 x i32> - store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll deleted file mode 100644 index 4cf8e4bfed5..00000000000 --- a/test/CodeGen/R600/bswap.ll +++ /dev/null @@ -1,115 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.bswap.i32(i32) nounwind readnone -declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone -declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone -declare i64 @llvm.bswap.i64(i64) nounwind readnone -declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone -declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone - -; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 -; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 -; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff -; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone - store i32 %bswap, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_bswap_v2i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone - store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_bswap_v4i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 - %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone - store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_bswap_v8i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: 
v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { - %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 - %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone - store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone - store i64 %bswap, i64 addrspace(1)* %out, align 8 - ret void -} - -define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone - store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 - ret void -} - -define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 - %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone - store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll deleted file mode 100644 index 65eacf5adc4..00000000000 --- a/test/CodeGen/R600/build_vector.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI - -; R600: {{^}}build_vector2: -; R600: MOV -; R600: MOV -; R600-NOT: MOV -; SI: {{^}}build_vector2: -; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 -; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} -define void @build_vector2 (<2 x i32> addrspace(1)* %out) { -entry: - store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out - ret void -} - -; R600: {{^}}build_vector4: -; R600: MOV -; R600: MOV -; R600: MOV -; R600: MOV -; R600-NOT: MOV -; SI: {{^}}build_vector4: -; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 -; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 -; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 -; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} -define void @build_vector4 (<4 x i32> addrspace(1)* %out) { -entry: - store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll deleted file mode 100644 index e769fd11c28..00000000000 --- a/test/CodeGen/R600/call.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s -; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported call to function external_function in test_call_external - - -declare i32 @external_function(i32) nounwind - -define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %c = call i32 
@external_function(i32 %b) nounwind - %result = add i32 %a, %c - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define i32 @defined_function(i32 %x) nounwind noinline { - %y = add i32 %x, 8 - ret i32 %y -} - -define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %c = call i32 @defined_function(i32 %b) nounwind - %result = add i32 %a, %c - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll deleted file mode 100644 index 87bebbc49d5..00000000000 --- a/test/CodeGen/R600/call_fs.ll +++ /dev/null @@ -1,17 +0,0 @@ - -; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s - -; EG: .long 257 -; EG: {{^}}call_fs: -; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] -; R600: .long 257 -; R600: {{^}}call_fs: -; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] - - -define void @call_fs() #0 { - ret void -} - -attributes #0 = { "ShaderType"="1" } ; Vertex Shader diff --git a/test/CodeGen/R600/cayman-loop-bug.ll b/test/CodeGen/R600/cayman-loop-bug.ll deleted file mode 100644 index c7b8c403731..00000000000 --- a/test/CodeGen/R600/cayman-loop-bug.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: LOOP_START_DX10 -; CHECK: ALU_PUSH_BEFORE -; CHECK: LOOP_START_DX10 -; CHECK: PUSH -; CHECK-NOT: ALU_PUSH_BEFORE -; CHECK: END_LOOP -; CHECK: END_LOOP -define void @main (<4 x float> inreg %reg0) #0 { -entry: - br label %outer_loop -outer_loop: - %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop] - %cond = icmp eq i32 %cnt, 16 - br i1 %cond, label %outer_loop_body, label %exit -outer_loop_body: - %cnt_incr = add i32 %cnt, 1 - br label %inner_loop -inner_loop: - %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body] - %cond2 = icmp eq i32 %cnt2, 16 - br i1 %cond, label %inner_loop_body, label %outer_loop -inner_loop_body: - %cnt2_incr = add i32 %cnt2, 1 - br label %inner_loop -exit: - ret void -} - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll deleted file mode 100644 index 75b87e48622..00000000000 --- a/test/CodeGen/R600/cf-stack-bug.ll +++ /dev/null @@ -1,244 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG32 %s < %t - -; RUN: llc -march=r600 
-mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; REQUIRES: asserts - -; We are currently allocating 2 extra sub-entries on Evergreen / NI for -; non-WQM push instructions if we change this to 1, then we will need to -; add one level of depth to each of these tests. - -; BUG64-NOT: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested3: -define void @nested3(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.store.1 - -if.store.1: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - store i32 3, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested4: -define void @nested4(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - store i32 4, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested7: -define void @nested7(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - %4 = icmp sgt i32 %cond, 40 - br i1 %4, label %if.5, label %if.4.store - -if.4.store: - store i32 4, i32 addrspace(1)* %out - br label %end - -if.5: - %5 = icmp sgt i32 %cond, 50 - br i1 %5, label %if.6, label %if.5.store - -if.5.store: - store i32 5, i32 addrspace(1)* %out - br label %end - -if.6: - %6 = icmp sgt i32 %cond, 60 - br i1 %6, label %if.7, label %if.6.store - -if.6.store: - store i32 6, i32 addrspace(1)* %out - br label %end - -if.7: - store i32 7, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested8: -define 
void @nested8(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - %4 = icmp sgt i32 %cond, 40 - br i1 %4, label %if.5, label %if.4.store - -if.4.store: - store i32 4, i32 addrspace(1)* %out - br label %end - -if.5: - %5 = icmp sgt i32 %cond, 50 - br i1 %5, label %if.6, label %if.5.store - -if.5.store: - store i32 5, i32 addrspace(1)* %out - br label %end - -if.6: - %6 = icmp sgt i32 %cond, 60 - br i1 %6, label %if.7, label %if.6.store - -if.6.store: - store i32 6, i32 addrspace(1)* %out - br label %end - -if.7: - %7 = icmp sgt i32 %cond, 70 - br i1 %7, label %if.8, label %if.7.store - -if.7.store: - store i32 7, i32 addrspace(1)* %out - br label %end - -if.8: - store i32 8, i32 addrspace(1)* %out - br label %end - -end: - ret void -} diff --git a/test/CodeGen/R600/cf_end.ll b/test/CodeGen/R600/cf_end.ll deleted file mode 100644 index c74ee22868d..00000000000 --- a/test/CodeGen/R600/cf_end.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s - -; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] -; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] -define void @eop() { - ret void -} diff --git a/test/CodeGen/R600/cgp-addressing-modes.ll b/test/CodeGen/R600/cgp-addressing-modes.ll deleted file mode 100644 index 77f7bd01b7f..00000000000 --- a/test/CodeGen/R600/cgp-addressing-modes.ll +++ /dev/null @@ -1,242 +0,0 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s - -declare i32 @llvm.r600.read.tidig.x() #0 - -; OPT-LABEL: @test_sink_global_small_offset_i32( -; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in -; OPT: br i1 -; OPT: ptrtoint - -; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: -; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(1)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( -; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 -; OPT: br i1 - -; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: {{^}}BB1_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* 
%out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} -; GCN: {{^}}BB2_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: {{^}}BB3_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_no_sink_flat_small_offset_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %in -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: -; GCN: flat_load_dword -; GCN: {{^}}BB4_2: - -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(4)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_scratch_small_offset_i32( -; OPT-NOT: getelementptr [512 x i32] -; OPT: br i1 -; OPT: ptrtoint - -; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: {{^}}BB5_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { -entry: - %alloca = alloca [512 x i32], align 4 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - 
%add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep - store i32 %load, i32 addrspace(1)* %out.gep.1 - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( -; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB6_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { -entry: - %alloca = alloca [512 x i32], align 4 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep - store i32 %load, i32 addrspace(1)* %out.gep.1 - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: {{^}}BB7_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { -entry: - %offset.ext = zext i32 %offset to i64 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(1)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/coalescer_remat.ll b/test/CodeGen/R600/coalescer_remat.ll deleted file mode 100644 index 96730bcf2e8..00000000000 --- a/test/CodeGen/R600/coalescer_remat.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s - -declare float @llvm.fma.f32(float, float, float) - -; This checks that rematerialization support of the coalescer does not -; unnecessarily widen the register class. Without those fixes > 20 VGprs -; are used here -; Also check that some rematerialization of the 0 constant happened. 
-; CHECK-LABEL: foobar -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 9 -define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { -entry: - %cmpflag = icmp eq i32 %flag, 1 - br i1 %cmpflag, label %loop, label %exit - -loop: - %c = phi i32 [0, %entry], [%cnext, %loop] - %v0 = phi float [0.0, %entry], [%fma.0, %loop] - %v1 = phi float [0.0, %entry], [%fma.1, %loop] - %v2 = phi float [0.0, %entry], [%fma.2, %loop] - %v3 = phi float [0.0, %entry], [%fma.3, %loop] - - ; Try to get the 0 constant to get coalesced into a wide register - %blup = insertelement <4 x float> undef, float %v0, i32 0 - store <4 x float> %blup, <4 x float> addrspace(1)* %out - - %load = load <4 x float>, <4 x float> addrspace(1)* %in - %load.0 = extractelement <4 x float> %load, i32 0 - %load.1 = extractelement <4 x float> %load, i32 1 - %load.2 = extractelement <4 x float> %load, i32 2 - %load.3 = extractelement <4 x float> %load, i32 3 - %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0) - %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1) - %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2) - %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3) - - %cnext = add nsw i32 %c, 1 - %cmp = icmp eq i32 %cnext, 42 - br i1 %cmp, label %exit, label %loop - -exit: - %ev0 = phi float [0.0, %entry], [%fma.0, %loop] - %ev1 = phi float [0.0, %entry], [%fma.1, %loop] - %ev2 = phi float [0.0, %entry], [%fma.2, %loop] - %ev3 = phi float [0.0, %entry], [%fma.3, %loop] - %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0 - %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1 - %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2 - %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3 - store <4 x float> %dst.3, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll deleted file mode 100644 index 58517209267..00000000000 --- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s - -; OPT-LABEL: @test( -; OPT: mul nsw i32 -; OPT-NEXT: sext - -; SI-LLC-LABEL: {{^}}test: -; SI-LLC: s_mul_i32 -; SI-LLC-NOT: mul -define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { -entry: - %0 = mul nsw i32 %a, 3 - %1 = sext i32 %0 to i64 - %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1 - store i8 %b, i8 addrspace(1)* %2 - ret void -} diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll deleted file mode 100644 index 01572afa620..00000000000 --- a/test/CodeGen/R600/combine_vloads.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -; -; kernel void combine_vloads(global char8* src, global char8* result) { -; for (int i = 0; i < 1024; ++i) -; result[i] = src[0] + src[1] + src[2] + src[3]; -; } -; - - -; 128-bit loads instead of many 8-bit -; EG-LABEL: {{^}}combine_vloads: -; EG: VTX_READ_128 -; EG: VTX_READ_128 -define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* 
nocapture %result) nounwind { -entry: - br label %for.body - -for.exit: ; preds = %for.body - ret void - -for.body: ; preds = %for.body, %entry - %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ] - %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)* - %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)* - %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32 - %1 = bitcast <8 x i32> %vecload2 to <32 x i8> - %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - %tmp9 = add nsw <8 x i8> %tmp5, %tmp8 - %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> - %tmp13 = add nsw <8 x i8> %tmp9, %tmp12 - %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - %tmp17 = add nsw <8 x i8> %tmp13, %tmp16 - %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01 - %2 = bitcast <8 x i8> %tmp17 to <2 x i32> - %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)* - store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8 - %tmp19 = add nsw i32 %i.01, 1 - %exitcond = icmp eq i32 %tmp19, 1024 - br i1 %exitcond, label %for.exit, label %for.body -} diff --git a/test/CodeGen/R600/commute-compares.ll b/test/CodeGen/R600/commute-compares.ll deleted file mode 100644 index 31766047a35..00000000000 --- a/test/CodeGen/R600/commute-compares.ll +++ /dev/null @@ -1,697 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #0 - -; -------------------------------------------------------------------------------- -; i32 compares -; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_eq_64_i32: -; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp eq i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ne_64_i32: -; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ne i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Why isn't this being folded as a constant?
-; GCN-LABEL: {{^}}commute_ne_litk_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ne i32 %val, 12345 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_64_i32: -; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ugt i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_64_i32: -; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} -define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp uge i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_64_i32: -; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ult i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_63_i32: -; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ule i32 %val, 63 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm - -; GCN-LABEL: {{^}}commute_ule_64_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ule i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sgt_neg1_i32: -; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} -define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = 
getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp sgt i32 %val, -1 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sge_neg2_i32: -; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} -define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp sge i32 %val, -2 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_slt_neg16_i32: -; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} -define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp slt i32 %val, -16 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sle_5_i32: -; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} -define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp sle i32 %val, 5 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; -------------------------------------------------------------------------------- -; i64 compares -; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_eq_64_i64: -; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp eq i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ne_64_i64: -; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ne i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_64_i64: -; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ugt i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_64_i64: -; GCN: 
v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp uge i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_64_i64: -; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ult i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_63_i64: -; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ule i64 %val, 63 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm - -; GCN-LABEL: {{^}}commute_ule_64_i64: -; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ule i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sgt_neg1_i64: -; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp sgt i64 %val, -1 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sge_neg2_i64: -; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp sge i64 %val, -2 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_slt_neg16_i64: -; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - 
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp slt i64 %val, -16 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sle_5_i64: -; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp sle i64 %val, 5 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; -------------------------------------------------------------------------------- -; f32 compares -; -------------------------------------------------------------------------------- - - -; GCN-LABEL: {{^}}commute_oeq_2.0_f32: -; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp oeq float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - - -; GCN-LABEL: {{^}}commute_ogt_2.0_f32: -; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ogt float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_oge_2.0_f32: -; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp oge float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_olt_2.0_f32: -; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp olt float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ole_2.0_f32: -; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ole float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* 
%gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_one_2.0_f32: -; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp one float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ord_2.0_f32: -; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ord float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ueq_2.0_f32: -; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ueq float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_2.0_f32: -; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ugt float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_2.0_f32: -; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp uge float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_2.0_f32: -; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ult float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_2.0_f32: -; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - 
%val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ule float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_une_2.0_f32: -; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp une float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uno_2.0_f32: -; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp uno float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; -------------------------------------------------------------------------------- -; f64 compares -; -------------------------------------------------------------------------------- - - -; GCN-LABEL: {{^}}commute_oeq_2.0_f64: -; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp oeq double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - - -; GCN-LABEL: {{^}}commute_ogt_2.0_f64: -; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ogt double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_oge_2.0_f64: -; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp oge double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_olt_2.0_f64: -; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp olt double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 
%ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ole_2.0_f64: -; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ole double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_one_2.0_f64: -; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp one double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ord_2.0_f64: -; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ord double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ueq_2.0_f64: -; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ueq double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_2.0_f64: -; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ugt double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_2.0_f64: -; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp uge double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_2.0_f64: -; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() 
#0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ult double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_2.0_f64: -; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ule double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_une_2.0_f64: -; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp une double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uno_2.0_f64: -; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp uno double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll deleted file mode 100644 index 7fc36eabb78..00000000000 --- a/test/CodeGen/R600/commute_modifiers.ll +++ /dev/null @@ -1,181 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.fma.f32(float, float, float) nounwind readnone - -; FUNC-LABEL: @commute_add_imm_fabs_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %z = fadd float 2.0, %x.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr 
float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs - %z = fmul float 4.0, %x.fneg.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_imm_fneg_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fneg = fsub float -0.000000e+00, %x - %z = fmul float 4.0, %x.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; FIXME: Should use SGPR for literal. -; FUNC-LABEL: @commute_add_lit_fabs_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 -; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]] -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %z = fadd float 1024.0, %x.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_add_fabs_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %z = fadd float %x, %y.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_fneg_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %y.fneg = fsub float -0.000000e+00, %y - %z = fmul float %x, %y.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_fabs_fneg_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 
offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs - %z = fmul float %x, %y.fabs.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; There's no reason to commute this. -; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %z = fmul float %x.fabs, %y.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs - %z = fmul float %x.fabs, %y.fabs.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; Make sure we commute the multiply part for the constant in src0 even -; though we have negate modifier on src2. 
- -; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32 -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]| -; SI: buffer_store_dword [[RESULT]] -define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r2.fabs = call float @llvm.fabs.f32(float %r2) - - %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/complex-folding.ll b/test/CodeGen/R600/complex-folding.ll deleted file mode 100644 index a5399a71324..00000000000 --- a/test/CodeGen/R600/complex-folding.ll +++ /dev/null @@ -1,19 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main: -; CHECK-NOT: MOV -define void @main(<4 x float> inreg %reg0) #0 { -entry: - %0 = extractelement <4 x float> %reg0, i32 0 - %1 = call float @fabs(float %0) - %2 = fptoui float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = insertelement <4 x float> undef, float %3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0) - ret void -} - -declare float @fabs(float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll deleted file mode 100644 index a09ed1f7385..00000000000 --- a/test/CodeGen/R600/concat_vectors.ll +++ /dev/null @@ -1,296 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_concat_v1i32: -; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF -; instructions that access scratch memory. Bit 23, which is the add_tid_enable -; bit, is only set for scratch access, so we can check for the absence of this -; value if we want to ensure scratch memory is not being used. 
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { - %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> - store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> - store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> - store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { - %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> - store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { - %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> - store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { - %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> - store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> - store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> - store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> - store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> - store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: 
{{^}}test_concat_v1i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { - %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> - store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> - store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> - store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> - store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> - store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { - %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> - store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> - store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> - store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> - store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %concat = shufflevector <16 x 
double> %a, <16 x double> %b, <32 x i32> - store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { - %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> - store <2 x i1> %concat, <2 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { - %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> - store <4 x i1> %concat, <4 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { - %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> - store <8 x i1> %concat, <8 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { - %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> - store <16 x i1> %concat, <16 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { - %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> - store <32 x i1> %concat, <32 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v32i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { - %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> - store <64 x i1> %concat, <64 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { - %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> - store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { - %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> - store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { - %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> - store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { - %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> - store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i16: -; SI-NOT: 
s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { - %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> - store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}concat_vector_crash: -; SI: s_endpgm -define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { -bb: - %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 - %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> - %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> - store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll deleted file mode 100644 index 8b397566066..00000000000 --- a/test/CodeGen/R600/copy-illegal-type.ll +++ /dev/null @@ -1,167 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_copy_v4i8: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; 
SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte - -; After scalarizing v4i8 loads is fixed. -; XSI: buffer_load_dword -; XSI: V_BFE -; XSI: V_ADD -; XSI: V_ADD -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI: buffer_store_dword - -; SI: s_endpgm -define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %add = add <4 x i8> %val, - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte - -; XSI: buffer_load_dword -; XSI: BFE -; XSI: buffer_store_dword -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI-NEXT: buffer_store_dword - -; SI: s_endpgm -define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %add = add <4 x i8> %val, - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v3i8: -; SI-NOT: bfe -; SI-NOT: bfi -; SI: s_endpgm -define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { - %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 - store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: s_endpgm -define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm -define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/copy-to-reg.ll b/test/CodeGen/R600/copy-to-reg.ll deleted file mode 100644 index fc875f6ef7a..00000000000 --- a/test/CodeGen/R600/copy-to-reg.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca 
-verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s - -; Test that CopyToReg instructions don't have non-register operands prior -; to being emitted. - -; Make sure this doesn't crash -; CHECK-LABEL: {{^}}copy_to_reg_frameindex: -define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %alloca = alloca [16 x i32] - br label %loop - -loop: - %inc = phi i32 [0, %entry], [%inc.i, %loop] - %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc - store i32 %inc, i32* %ptr - %inc.i = add i32 %inc, 1 - %cnd = icmp uge i32 %inc.i, 16 - br i1 %cnd, label %done, label %loop - -done: - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp0 - store i32 %tmp1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll deleted file mode 100644 index bd26c302fe5..00000000000 --- a/test/CodeGen/R600/ctlz_zero_undef.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone -declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone -declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] -define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone - store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 - %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone - store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll deleted file mode 100644 index 0a031c5e24d..00000000000 --- a/test/CodeGen/R600/ctpop.ll +++ /dev/null @@ -1,300 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone -declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone -declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctpop_i32: -; GCN: s_load_dword [[SVAL:s[0-9]+]], -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - store i32 %ctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; XXX - Why 0 in register? 
-; FUNC-LABEL: {{^}}v_ctpop_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - store i32 %ctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: -; GCN: buffer_load_dword [[VAL1:v[0-9]+]], -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %val1 = load i32, i32 addrspace(1)* %in1, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone - %add = add i32 %ctpop0, %ctpop1 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} -; GCN-NEXT: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %add = add i32 %ctpop0, %sval - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v2i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone - store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v4i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 - %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone - store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v8i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { - 
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 - %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone - store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v16i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 - %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone - store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %ctpop, 4 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 4, %ctpop - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %ctpop, 99999 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], -; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) 
nounwind readnone
-  %add = add i32 %ctpop, %const
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-
-; EG: BCNT_INT
-define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
-  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %add = add i32 %const, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-
-; EG: BCNT_INT
-define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
-  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
-  %const = load i32, i32 addrspace(1)* %gep, align 4
-  %add = add i32 %const, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FIXME: We currently disallow SALU instructions in all branches,
-; but there are some cases when they should be allowed.
-
-; FUNC-LABEL: {{^}}ctpop_i32_in_br:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
-; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
-entry:
-  %tmp0 = icmp eq i32 %cond, 0
-  br i1 %tmp0, label %if, label %else
-
-if:
-  %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
-  br label %endif
-
-else:
-  %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3
-  br label %endif
-
-endif:
-  %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
-  store i32 %tmp5, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
deleted file mode 100644
index e1a0ee3ea21..00000000000
--- a/test/CodeGen/R600/ctpop64.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-
-declare i64 @llvm.ctpop.i64(i64) nounwind readnone
-declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
-declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
-declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
-declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
-
-; FUNC-LABEL: {{^}}s_ctpop_i64:
-; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN:
s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm -define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { - %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone - %truncctpop = trunc i64 %ctpop to i32 - store i32 %truncctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 -; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone - %truncctpop = trunc i64 %ctpop to i32 - store i32 %truncctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_ctpop_v2i64: -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_endpgm -define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { - %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone - %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> - store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_ctpop_v4i64: -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_endpgm -define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { - %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone - %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> - store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v2i64: -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: s_endpgm -define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone - %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> - store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v4i64: -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: s_endpgm -define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 - %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone - %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> - store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: We currently disallow SALU instructions in all branches, -; but there are some cases when the should be allowed. 
- -; FUNC-LABEL: {{^}}ctpop_i64_in_br: -; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd -; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} -; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] -; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] -; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} -; GCN: s_endpgm -define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %else - -if: - %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg) - br label %endif - -else: - %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1 - %tmp4 = load i64, i64 addrspace(1)* %tmp3 - br label %endif - -endif: - %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else] - store i64 %tmp5, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll deleted file mode 100644 index 56fcb51fe14..00000000000 --- a/test/CodeGen/R600/cttz_zero_undef.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone -declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone -declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone - -; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone - store i32 %cttz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone - store i32 %cttz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone - store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 - %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone - store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll deleted file mode 100644 index 3399d9da29e..00000000000 --- a/test/CodeGen/R600/cvt_f32_ubyte.ll +++ /dev/null @@ -1,196 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}load_i8_to_f32: -; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] -; SI: buffer_store_dword [[CONV]], -define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 - %cvt = uitofp i8 %load to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI-NOT: and -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 - %cvt = uitofp <2 x i8> %load to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI-NOT: bfe -; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 -; SI-DAG: v_cvt_f32_ubyte0_e32 -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 - %cvt = uitofp <3 x i8> %load to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v4i8_to_v4f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 
v[[LORESULT:[0-9]+]], [[LOADREG]]
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
-  %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  ret void
-}
-
-; This should not be adding instructions to shift into the correct
-; position in the word for the component.
-
-; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI-NOT: v_lshlrev_b32
-; SI-NOT: v_or_b32
-
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
-
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
-  %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  ret void
-}
-
-; XXX - This should really still be able to use the v_cvt_f32_ubyte0
-; for each component, but computeKnownBits doesn't handle vectors very
-; well.
-
-; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-
-; XXX - replace with this when v4i8 loads aren't scalarized anymore.
-; XSI: buffer_load_dword
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; SI: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
-  %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
-  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
-  ret void
-}
-
-; Make sure this doesn't crash.
-; SI-LABEL: {{^}}load_v7i8_to_v7f32: -; SI: s_endpgm -define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 - %cvt = uitofp <7 x i8> %load to <7 x float> - store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v8i8_to_v8f32: -; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 - %cvt = uitofp <8 x i8> %load to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]], -; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] -; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] -; SI: buffer_store_dword [[CONV]], -define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 2 - %inreg = and i32 %add, 255 - %cvt = uitofp i32 %inreg to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: -define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %inreg = and i32 %load, 65280 - %shr = lshr i32 %inreg, 8 - %cvt = uitofp i32 %shr to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; We don't get these ones because of the zext, but instcombine removes -; them so it shouldn't really matter. 
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 - %ext = zext i8 %load to i32 - %cvt = uitofp i32 %ext to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 - %ext = zext <4 x i8> %load to <4 x i32> - %cvt = uitofp <4 x i32> %ext to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/cvt_flr_i32_f32.ll b/test/CodeGen/R600/cvt_flr_i32_f32.ll deleted file mode 100644 index 2dd3a9f2a77..00000000000 --- a/test/CodeGen/R600/cvt_flr_i32_f32.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.floor.f32(float) #1 - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0: -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NOT: add -; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %floor = call float @llvm.floor.f32(float %x) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1: -; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}} -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 1.0 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| -; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %floor = call float @llvm.floor.f32(float %x.fabs) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fneg = fsub float -0.000000e+00, %x - %floor = call float @llvm.floor.f32(float %x.fneg) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| -; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs - %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* 
%out - ret void -} - -; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0: -; SI-NOT: v_cvt_flr_i32_f32 -; SI: v_floor_f32 -; SI: v_cvt_u32_f32_e32 -; SI: s_endpgm -define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %floor = call float @llvm.floor.f32(float %x) #1 - %cvt = fptoui float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/cvt_rpi_i32_f32.ll b/test/CodeGen/R600/cvt_rpi_i32_f32.ll deleted file mode 100644 index 864ac40260b..00000000000 --- a/test/CodeGen/R600/cvt_rpi_i32_f32.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.floor.f32(float) #1 - -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %fadd = fadd float %x.fabs, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FIXME: This doesn't work because it forms fsub 0.5, x -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg: -; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} -; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}} -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fneg = fsub float -0.000000e+00, %x - %fadd = fadd float %x.fneg, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FIXME: This doesn't work for same reason as above -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| - -; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}| -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs - %fadd = fadd float %x.fabs.fneg, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0: -; SI-NOT: v_cvt_rpi_i32_f32 -; SI: v_add_f32 -; SI: v_floor_f32 -; SI: v_cvt_u32_f32 -; SI: s_endpgm -define void @no_cvt_rpi_i32_f32_0(i32 
addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptoui float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll deleted file mode 100644 index fb43ff4fbdd..00000000000 --- a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ /dev/null @@ -1,36 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test is for a bug in -; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where -; the wrong type was being passed to -; TargetLowering::getOperationAction() when checking the legality of -; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. - - -; CHECK: {{^}}sint: -; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %sint = load i32, i32 addrspace(1) * %in - %conv = sitofp i32 %sint to float - %0 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer - store <4 x float> %splat, <4 x float> addrspace(1)* %out - ret void -} - -;CHECK: {{^}}uint: -;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %uint = load i32, i32 addrspace(1) * %in - %conv = uitofp i32 %uint to float - %0 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer - store <4 x float> %splat, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/debug.ll b/test/CodeGen/R600/debug.ll deleted file mode 100644 index a2e0e878b74..00000000000 --- a/test/CodeGen/R600/debug.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; Test for a crash in the custom assembly dump code. 
- -; SI: s_endpgm -define void @test(i32 addrspace(1)* %out) { - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll deleted file mode 100644 index da8e91454b9..00000000000 --- a/test/CodeGen/R600/default-fp-mode.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_kernel: - -; DEFAULT: FloatMode: 192 -; DEFAULT: IeeeMode: 0 - -; FP64-DENORMAL: FloatMode: 192 -; FP64-DENORMAL: IeeeMode: 0 - -; FP32-DENORMAL: FloatMode: 48 -; FP32-DENORMAL: IeeeMode: 0 - -; BOTH-DENORMAL: FloatMode: 240 -; BOTH-DENORMAL: IeeeMode: 0 - -; NO-DENORMAL: FloatMode: 0 -; NO-DENORMAL: IeeeMode: 0 -define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { - store float 0.0, float addrspace(1)* %out0 - store double 0.0, double addrspace(1)* %out1 - ret void -} diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll deleted file mode 100644 index cdd2c0cd4f4..00000000000 --- a/test/CodeGen/R600/disconnected-predset-break-bug.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; PRED_SET* instructions must be tied to any instruction that uses their -; result. This tests that there are no instructions between the PRED_SET* -; and the PREDICATE_BREAK in this loop. 
- -; CHECK: {{^}}loop_ge: -; CHECK: LOOP_START_DX10 -; CHECK: ALU_PUSH_BEFORE -; CHECK-NEXT: JUMP -; CHECK-NEXT: LOOP_BREAK -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { -entry: - %cmp5 = icmp sgt i32 %iterations, 0 - br i1 %cmp5, label %for.body, label %for.end - -for.body: ; preds = %for.body, %entry - %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] - %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %i.07 = add nsw i32 %i.07.in, -1 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 - store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 - %add = add nsw i32 %ai.06, 1 - %exitcond = icmp eq i32 %add, %iterations - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll deleted file mode 100644 index 4df7b63bf98..00000000000 --- a/test/CodeGen/R600/dot4-folding.ll +++ /dev/null @@ -1,27 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Exactly one constant vector can be folded into dot4, which means exactly -; 4 MOV instructions -; CHECK: {{^}}main: -; CHECK: MOV -; CHECK: MOV -; CHECK: MOV -; CHECK: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV - -define void @main(float addrspace(1)* %out) { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) - %3 = insertelement <4 x float> undef, float %2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll deleted file mode 100644 index e7e13d6178c..00000000000 --- a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() #1 - -; Function Attrs: nounwind -; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop: -; CHECK: BB0_1: -; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] -; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] -; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] -; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] -; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] - -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 -; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 -; CHECK: s_endpgm -define void 
@signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { -entry: - %x.i = tail call i32 @llvm.r600.read.tidig.x() #0 - %mul = shl nsw i32 %x.i, 1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ] - %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ] - %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - tail call void @llvm.AMDGPU.barrier.local() #1 - %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 - %tmp = load float, float addrspace(3)* %arrayidx, align 4 - %add1 = add nsw i32 %offset.02, 1 - %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 - %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 - %add3 = add nsw i32 %offset.02, 32 - %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 - %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 - %add5 = add nsw i32 %offset.02, 33 - %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 - %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 - %add7 = add nsw i32 %offset.02, 64 - %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7 - %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4 - %add9 = fadd float %tmp, %tmp1 - %add10 = fadd float %add9, %tmp2 - %add11 = fadd float %add10, %tmp3 - %add12 = fadd float %add11, %tmp4 - %add13 = fadd float %sum.03, %add12 - %inc = add nsw i32 %k.01, 1 - %add14 = add nsw i32 %offset.02, 97 - %exitcond = icmp eq i32 %inc, 8 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - %tmp5 = sext i32 %x.i to i64 - %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5 - store float %add13, float addrspace(1)* %arrayidx15, align 4 - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { noduplicate nounwind } -attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll deleted file mode 100644 index 5929898f8bd..00000000000 --- a/test/CodeGen/R600/ds_read2.ll +++ /dev/null @@ -1,515 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -; FIXME: We don't get cases where the address was an SGPR because we -; get a copy to the address register for each one. 
- -@lds = addrspace(3) global [512 x float] undef, align 4 - @lds.f64 = addrspace(3) global [512 x double] undef, align 8 - -; SI-LABEL: @simple_read2_f32 -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f32(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_max_offset -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_too_far -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm -define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_x2 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load 
float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure there is an instruction between the two sets of reads. -; SI-LABEL: @simple_read2_f32_x2_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: s_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - call void @llvm.AMDGPU.barrier.local() #2 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; For some reason adding something to the base address for the first -; element results in only folding the inner pair. 
- -; SI-LABEL: @simple_read2_f32_x2_nonzero_base -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Be careful of vectors of pointers. We don't know if the 2 pointers -; in the vectors are really the same base, so this is not safe to -; merge. -; Base pointers come from different subregister of same super -; register. We can't safely merge this. - -; SI-LABEL: @read2_ptr_is_subreg_arg_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - %val0 = load float, float addrspace(3)* %gep.0, align 4 - %val1 = load float, float addrspace(3)* %gep.1, align 4 - %add.x = add nsw i32 %x.i, 8 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Apply a constant scalar offset after the pointer vector extract. We -; are rejecting merges that have the same, constant 0 offset, so make -; sure we are really rejecting it because of the different -; subregisters. 
-
-; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
-; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
-  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-
-  ; Apply an additional offset after the vector that will be more obviously folded.
-  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
-
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
-  %add.x = add nsw i32 %x.i, 8
-  %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
-  ret void
-}
-
-; We should be able to merge in this case, but probably not worth the effort.
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
-; SI: s_endpgm
-define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
-  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
-  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
-  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
-  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
-  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1, align 4
-  %add.x = add nsw i32 %x.i, 8
-  %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
-  ret void
-}
-
-; SI-LABEL: @simple_read2_f32_volatile_0
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
-; SI: s_endpgm
-define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
-  %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
-  %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
-  ret void
-}
-
-; SI-LABEL: @simple_read2_f32_volatile_1
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
-; SI: s_endpgm
-define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 =
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Can't fold since not correctly aligned. -; XXX: This isn't really testing anything useful now. I think CI -; allows unaligned LDS accesses, which would be a problem here. -; SI-LABEL: @unaligned_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm -define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 1 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 1 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @misaligned_2_simple_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm -define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 2 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 2 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f64 -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} -; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f64(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2_f64_max_offset -; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 -; SI: s_endpgm -define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 
0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2_f64_too_far -; SI-NOT ds_read2_b64 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 -; SI: s_endpgm -define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; Alignment only 4 -; SI-LABEL: @misaligned_read2_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 -; SI: s_endpgm -define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 7 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 4 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -@foo = addrspace(3) global [4 x i32] undef, align 4 - -; SI-LABEL: @load_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { - %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 - %sum = add i32 %val0, %val1 - store i32 %sum, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @load_constant_disjoint_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 -define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { - %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 - %sum = add i32 %val0, %val1 - store i32 %sum, i32 addrspace(1)* %out, align 4 - ret void -} - -@bar = addrspace(3) global [4 x i64] undef, align 4 - -; SI-LABEL: @load_misaligned64_constant_offsets -; 
SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 -define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { - %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 - %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 - %sum = add i64 %val0, %val1 - store i64 %sum, i64 addrspace(1)* %out, align 8 - ret void -} - -@bar.large = addrspace(3) global [4096 x i64] undef, align 4 - -; SI-LABEL: @load_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 -; SI: s_endpgm -define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { - %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 - %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 - %sum = add i64 %val0, %val1 - store i64 %sum, i64 addrspace(1)* %out, align 8 - ret void -} - -@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 -@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 - -define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { - %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i - %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 - %add47 = add nsw i32 %x.i, 1 - %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 - %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 - %add51 = add nsw i32 %x.i, 16 - %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 - %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 - %add55 = add nsw i32 %x.i, 17 - %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 - %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 - %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i - %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 - %add63 = add nsw i32 %y.i, 1 - %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 - %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 - %add67 = add nsw i32 %y.i, 32 - %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 - %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 - %add71 = add nsw i32 %y.i, 33 - %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 - %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 - %add75 = add nsw i32 %y.i, 64 - %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 - %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 - %add79 = add nsw i32 %y.i, 65 - %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 - %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 - %sum.0 = fadd float %tmp16, %tmp17 - %sum.1 = fadd float %sum.0, %tmp18 - %sum.2 = fadd float %sum.1, %tmp19 - %sum.3 = fadd float %sum.2, %tmp20 - %sum.4 = fadd float %sum.3, %tmp21 - %sum.5 = fadd float %sum.4, %tmp22 - %sum.6 = fadd float %sum.5, %tmp23 - %sum.7 = fadd float %sum.6, %tmp24 - %sum.8 = fadd float %sum.7, %tmp25 - store float %sum.8, float addrspace(1)* %C, align 4 - ret void -} - -define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { - %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 - store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { - %load = load i64, i64 addrspace(3)* %in, align 4 - store i64 %load, i64 addrspace(1)* %out, align 8 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/ds_read2_offset_order.ll b/test/CodeGen/R600/ds_read2_offset_order.ll deleted file mode 100644 index 9ea9a5a2617..00000000000 --- a/test/CodeGen/R600/ds_read2_offset_order.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -; XFAIL: * - -@lds = addrspace(3) global [512 x float] undef, align 4 - -; SI-LABEL: {{^}}offset_order: - -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 -; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 - -define void @offset_order(float addrspace(1)* %out) { -entry: - %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 - %val0 = load float, float addrspace(3)* %ptr0 - - %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256 - %val1 = load float, float addrspace(3)* %ptr1 - %add1 = fadd float %val0, %val1 - - %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3 - %val2 = load float, float addrspace(3)* %ptr2 - %add2 = fadd float %add1, %val2 - - %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 - %val3 = load float, float addrspace(3)* %ptr3 - %add3 = fadd float %add2, %val3 - - %ptr4 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 - %val4 = load float, float addrspace(3)* %ptr4 - %add4 = fadd float %add3, %val4 - - %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14 - %val5 = load float, float addrspace(3)* %ptr5 - %add5 = fadd float %add4, %val5 - - %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 - %val6 = load float, float addrspace(3)* %ptr6 - %add6 = fadd float %add5, %val6 - store float %add6, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll deleted file mode 100644 index 54b3b45636d..00000000000 --- a/test/CodeGen/R600/ds_read2st64.ll +++ /dev/null @@ -1,272 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - -@lds = addrspace(3) global [512 x float] undef, align 4 -@lds.f64 = addrspace(3) global [512 x double] undef, align 8 - - -; SI-LABEL: @simple_read2st64_f32_0_1 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_1_2 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 128 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_max_offset -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float 
addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 16320 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_over_max_offset -; SI-NOT: ds_read2st64_b32 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 -; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] -; SI: s_endpgm -define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 16384 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @odd_invalid_read2st64_f32_0 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm -define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 63 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @odd_invalid_read2st64_f32_1 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm -define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 127 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_0_1 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 
- %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_1_2 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; Alignment only - -; SI-LABEL: @misaligned_read2st64_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 -; SI: s_endpgm -define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 4 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff -; SI-LABEL: @simple_read2st64_f64_max_offset -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 256 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: 
@simple_read2st64_f64_over_max_offset -; SI-NOT: ds_read2st64_b64 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] -; SI: s_endpgm -define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8192 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @invalid_read2st64_f64_odd_offset -; SI-NOT: ds_read2st64_b64 -; SI: s_endpgm -define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8129 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; The stride of 8 elements is 8 * 8 bytes. We need to make sure the -; stride in elements, not bytes, is a multiple of 64. 
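The offset arithmetic behind these st64 cases is easier to see spelled out. The helper below is an editorial aside, not code from this patch: it simply assumes, as the comments above state, that each ds_read2st64 offset field is an 8-bit count of 64-element strides and that the resulting byte offset must stay within 0xffff.

// Editorial illustration of the st64 offset limits discussed above;
// not taken from the LLVM sources.
#include <cstdio>

// Byte offset addressed by an st64 offset field for a given element size.
constexpr unsigned st64Bytes(unsigned offsetField, unsigned eltBytes) {
  return offsetField * 64 * eltBytes;
}

int main() {
  // f32: the full 8-bit field fits, so offset1:255 (add of 16320 = 255*64) is legal.
  std::printf("f32 offset1:255 -> %u bytes\n", st64Bytes(255, 4)); // 65280 <= 0xffff

  // f64: 255 would overflow 16 bits, which is why the max-offset test stops at 127.
  std::printf("f64 offset1:255 -> %u bytes\n", st64Bytes(255, 8)); // 130560 > 0xffff
  std::printf("f64 offset1:127 -> %u bytes\n", st64Bytes(127, 8)); // 65024 <= 0xffff

  // An 8-element f64 stride is 64 bytes, not 64 elements, so the following test
  // expects plain ds_read2_b64 ... offset1:8 rather than ds_read2st64_b64.
  std::printf("8 x f64 stride -> %u bytes\n", 8 * 8);
  return 0;
}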
- -; SI-LABEL: @byte_size_only_divisible_64_read2_f64 -; SI-NOT: ds_read2st_b64 -; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 -; SI: s_endpgm -define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll deleted file mode 100644 index b553d3459e4..00000000000 --- a/test/CodeGen/R600/ds_write2.ll +++ /dev/null @@ -1,425 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -@lds = addrspace(3) global [512 x float] undef, align 4 -@lds.f64 = addrspace(3) global [512 x double] undef, align 8 - - -; SI-LABEL: @simple_write2_one_val_f32 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i - %val = load float, float addrspace(1)* %in.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 
= getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_volatile_0 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store volatile float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_volatile_1 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; 2 data subregisters from different super registers. 
-; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 -; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 - %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 - %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 - %val0.0 = extractelement <2 x float> %val0, i32 0 - %val1.1 = extractelement <2 x float> %val1, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0.0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1.1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_subreg2_f32 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 - %val0 = extractelement <2 x float> %val, i32 0 - %val1 = extractelement <2 x float> %val, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_subreg4_f32 -; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i - %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 - %val0 = extractelement <4 x float> %val, i32 0 - %val1 = extractelement <4 x float> %val, i32 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: 
buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm -define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_too_far_f32 -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm -define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: 
@simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: s_endpgm -define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - - ; Apply an additional offset after the vector that will be more obviously folded. 
- %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 - store float %val0, float addrspace(3)* %gep.0, align 4 - - %add.x = add nsw i32 %x.i, 8 - store float %val1, float addrspace(3)* %gep.1.offset, align 4 - ret void -} - -; SI-LABEL: @simple_write2_one_val_f64 -; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; SI-LABEL: @misaligned_simple_write2_one_val_f64 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 -; SI: s_endpgm -define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 7 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 - %val0 = load double, double addrspace(1)* %in.gep.0, align 8 - %val1 = load double, double addrspace(1)* %in.gep.1, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val0, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val1, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -@foo = addrspace(3) global [4 x i32] undef, align 4 - -; SI-LABEL: @store_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -define void @store_constant_adjacent_offsets() { - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 - ret void -} - -; SI-LABEL: @store_constant_disjoint_offsets -; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 -define void @store_constant_disjoint_offsets() { - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 - ret void -} - -@bar = addrspace(3) global [4 x i64] undef, align 4 - -; SI-LABEL: @store_misaligned64_constant_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 -define void @store_misaligned64_constant_offsets() { - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 - ret void -} - -@bar.large = addrspace(3) global [4096 x i64] undef, align 4 - -; SI-LABEL: @store_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm -define void @store_misaligned64_constant_large_offsets() { - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 - ret void -} - -@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 -@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 - -define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %val = load float, float addrspace(1)* %in - %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx44, align 4 - %add47 = add nsw i32 %x.i, 1 - %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 - store float %val, float addrspace(3)* %arrayidx48, align 4 - %add51 = add nsw i32 %x.i, 16 - %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 - store float %val, float addrspace(3)* %arrayidx52, align 4 - %add55 = add nsw i32 %x.i, 17 - %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 - store float %val, float addrspace(3)* %arrayidx56, align 4 - %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i - store float %val, 
float addrspace(3)* %arrayidx60, align 4 - %add63 = add nsw i32 %y.i, 1 - %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 - store float %val, float addrspace(3)* %arrayidx64, align 4 - %add67 = add nsw i32 %y.i, 32 - %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 - store float %val, float addrspace(3)* %arrayidx68, align 4 - %add71 = add nsw i32 %y.i, 33 - %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 - store float %val, float addrspace(3)* %arrayidx72, align 4 - %add75 = add nsw i32 %y.i, 64 - %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 - store float %val, float addrspace(3)* %arrayidx76, align 4 - %add79 = add nsw i32 %y.i, 65 - %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 - store float %val, float addrspace(3)* %arrayidx80, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll deleted file mode 100644 index 1d9d881c5c7..00000000000 --- a/test/CodeGen/R600/ds_write2st64.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - - -@lds = addrspace(3) global [512 x float] undef, align 4 - - -; SI-LABEL: @simple_write2st64_one_val_f32_0_1 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 -; SI: s_endpgm -define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i - %val = load float, float addrspace(1)* %in.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_f32_2_5 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 -; SI: s_endpgm -define void 
@simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %add.x.0 = add nsw i32 %x.i, 128 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 320 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 16320 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], -; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 -; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 - %val0 = load double, double addrspace(1)* %in.gep.0, align 8 - %val1 = load double, double addrspace(1)* %in.gep.1, align 8 - %add.x.0 = add nsw i32 %x.i, 256 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - store double %val0, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - store double %val1, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 -; SI-NOT: ds_write2st64_b64 -; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 -; SI: s_endpgm -define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll deleted file mode 100644 index d0fd06a3437..00000000000 --- a/test/CodeGen/R600/elf.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s -; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s - -; Test that we don't try to produce a COFF file on windows -; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s - -; ELF: Format: ELF32 -; ELF: Name: .AMDGPU.config -; ELF: Type: SHT_PROGBITS - -; ELF: Symbol { -; ELF: Name: test -; ELF: Binding: Global - -; CONFIG: .section .AMDGPU.config -; CONFIG-NEXT: .long 45096 -; TYPICAL-NEXT: .long 0 -; TONGA-NEXT: .long 576 -; CONFIG: .align 256 -; CONFIG: test: -define void @test(i32 %p) #0 { - %i = add i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll deleted file mode 100644 index 51cd0850093..00000000000 --- a/test/CodeGen/R600/elf.r600.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s 
-march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s - -; ELF: Format: ELF32 -; ELF: Name: .AMDGPU.config - -; CONFIG: .section .AMDGPU.config -; CONFIG-NEXT: .long 166100 -; CONFIG-NEXT: .long 2 -; CONFIG-NEXT: .long 165900 -; CONFIG-NEXT: .long 0 -define void @test(float addrspace(1)* %out, i32 %p) { - %i = add i32 %p, 2 - %r = bitcast i32 %i to float - store float %r, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/empty-function.ll b/test/CodeGen/R600/empty-function.ll deleted file mode 100644 index a060900811e..00000000000 --- a/test/CodeGen/R600/empty-function.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; Make sure we don't assert on empty functions - -; SI: .text -; SI-LABEL: {{^}}empty_function_ret: -; SI: s_endpgm -; SI: codeLenInByte = 4 -define void @empty_function_ret() #0 { - ret void -} - -; SI: .text -; SI-LABEL: {{^}}empty_function_unreachable: -; SI: codeLenInByte = 0 -define void @empty_function_unreachable() #0 { - unreachable -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/endcf-loop-header.ll b/test/CodeGen/R600/endcf-loop-header.ll deleted file mode 100644 index 267a323c506..00000000000 --- a/test/CodeGen/R600/endcf-loop-header.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; This tests that the llvm.SI.end.cf intrinsic is not inserted into the -; loop block. This intrinsic will be lowered to s_or_b64 by the code -; generator. - -; CHECK-LABEL: {{^}}test: - -; This is was lowered from the llvm.SI.end.cf intrinsic: -; CHECK: s_or_b64 exec, exec - -; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} -; CHECK-NOT: s_or_b64 exec, exec -; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %loop - -if: - store i32 0, i32 addrspace(1)* %out - br label %loop - -loop: - %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop] - %inc = add i32 %tmp1, %cond - %tmp2 = icmp ugt i32 %inc, 10 - br i1 %tmp2, label %done, label %loop - -done: - %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1 - store i32 %inc, i32 addrspace(1)* %tmp3 - ret void -} diff --git a/test/CodeGen/R600/extload-private.ll b/test/CodeGen/R600/extload-private.ll deleted file mode 100644 index 294c3a9c678..00000000000 --- a/test/CodeGen/R600/extload-private.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i8_sext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 - %tmp2 = sext i8 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i8_zext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 - %tmp2 = zext i8 %tmp1 to i32 - store i32 %tmp2, 
i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i16_sext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 - %tmp2 = sext i16 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i16_zext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 - %tmp2 = zext i16 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll deleted file mode 100644 index 662eb7a9716..00000000000 --- a/test/CodeGen/R600/extload.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}anyext_load_i8: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], -; EG: VTX_READ_32 [[VAL]] - -define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { - %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* - %load = load i32, i32 addrspace(1)* %cast, align 1 - %x = bitcast i32 %load to <4 x i8> - %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)* - store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_i16: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], -; EG: VTX_READ_32 [[VAL]] - -define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { - %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* - %load = load i32, i32 addrspace(1)* %cast, align 1 - %x = bitcast i32 %load to <2 x i16> - %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)* - store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_lds_i8: -; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] -; EG: LDS_WRITE * [[VAL]] -define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { - %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* - %load = load i32, i32 addrspace(3)* %cast, align 1 - %x = bitcast i32 %load to <4 x i8> - %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)* - store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_lds_i16: -; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] -; EG: LDS_WRITE * [[VAL]] -define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { - %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* - %load = load i32, i32 addrspace(3)* %cast, align 1 - %x = bitcast i32 %load to <2 x i16> - %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)* - store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1 - ret void -} diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll deleted file 
mode 100644 index c7572efc6f5..00000000000 --- a/test/CodeGen/R600/extract_vector_elt_i16.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}extract_vector_elt_v2i16: -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_store_short -; SI: buffer_store_short -define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { - %p0 = extractelement <2 x i16> %foo, i32 0 - %p1 = extractelement <2 x i16> %foo, i32 1 - %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 - ret void -} - -; FUNC-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_store_short -; SI: buffer_store_short -define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { - %p0 = extractelement <4 x i16> %foo, i32 0 - %p1 = extractelement <4 x i16> %foo, i32 2 - %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 - ret void -} diff --git a/test/CodeGen/R600/fabs.f64.ll b/test/CodeGen/R600/fabs.f64.ll deleted file mode 100644 index 3c6136c1a7b..00000000000 --- a/test/CodeGen/R600/fabs.f64.ll +++ /dev/null @@ -1,97 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -declare double @fabs(double) readnone -declare double @llvm.fabs.f64(double) readnone -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone -declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone - -; FUNC-LABEL: {{^}}v_fabs_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tidext = sext i32 %tid to i64 - %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext - %val = load double, double addrspace(1)* %gep, align 8 - %fabs = call double @llvm.fabs.f64(double %val) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_f64: -; SI: v_and_b32 -; SI-NOT: v_and_b32 -; SI: s_endpgm -define void @fabs_f64(double addrspace(1)* %out, double %in) { - %fabs = call double @llvm.fabs.f64(double %in) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { - %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) - store <2 x double> %fabs, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { - %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) - store <4 x double> %fabs, <4 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm -define 
void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { - %fabs = call double @llvm.fabs.f64(double %in0) - %fmul = fmul double %fabs, %in1 - store double %fmul, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm -define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { - %fabs = call double @fabs(double %in0) - %fmul = fmul double %fabs, %in1 - store double %fmul, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { - %bc= bitcast i64 %in to double - %fabs = call double @llvm.fabs.f64(double %bc) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { - %bc= bitcast i64 %in to double - %fabs = call double @fabs(double %bc) - store double %fabs, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll deleted file mode 100644 index 419a73d0266..00000000000 --- a/test/CodeGen/R600/fabs.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; DAGCombiner will transform: -; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fabs_fn_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: v_and_b32 - -define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { - %bc= bitcast i32 %in to float - %fabs = call float @fabs(float %bc) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: v_and_b32 - -define void @fabs_free(float addrspace(1)* %out, i32 %in) { - %bc= bitcast i32 %in to float - %fabs = call float @llvm.fabs.f32(float %bc) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -define void @fabs_f32(float addrspace(1)* %out, float %in) { - %fabs = call float @llvm.fabs.f32(float %in) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -; GCN: v_and_b32 -define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) - store <2 x float> %fabs, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v4f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) - store <4 x float> %fabs, <4 x float> 
addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} -define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { - %fabs = call float @fabs(float %in0) - %fmul = fmul float %fabs, %in1 - store float %fmul, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fabs_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} -define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { - %fabs = call float @llvm.fabs.f32(float %in0) - %fmul = fmul float %fabs, %in1 - store float %fmul, float addrspace(1)* %out - ret void -} - -declare float @fabs(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll deleted file mode 100644 index 5fac328c598..00000000000 --- a/test/CodeGen/R600/fadd.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -; FUNC-LABEL: {{^}}fadd_f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI: v_add_f32 -define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { - %add = fadd float %a, %b - store float %add, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v2f32: -; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z -; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { - %add = fadd <2 x float> %a, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v4f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 - %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v8f32: -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { - %add = fadd <8 x float> %a, %b - store <8 x float> %add, <8 x float> 
addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll deleted file mode 100644 index 485c55870c4..00000000000 --- a/test/CodeGen/R600/fadd64.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fadd_f64: -; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} - -define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fadd double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll deleted file mode 100644 index f23e8919d73..00000000000 --- a/test/CodeGen/R600/fceil.ll +++ /dev/null @@ -1,132 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.ceil.f32(float) nounwind readnone -declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone -declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone -declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone -declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone - -; FUNC-LABEL: {{^}}fceil_f32: -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_f32(float addrspace(1)* %out, float %x) { - %y = call float @llvm.ceil.f32(float %x) nounwind readnone - store float %y, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v2f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { - %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone - store <2 x float> %y, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v3f32: -; FIXME-SI: v_ceil_f32_e32 -; FIXME-SI: v_ceil_f32_e32 -; FIXME-SI: v_ceil_f32_e32 -; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { - %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone - store <3 x float> %y, <3 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v4f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? 
*}}[[RESULT]] -define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { - %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone - store <4 x float> %y, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v8f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { - %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone - store <8 x float> %y, <8 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v16f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] -define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { - %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone - store <16 x float> %y, <16 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll deleted file mode 100644 index e8c34f0141e..00000000000 --- a/test/CodeGen/R600/fceil64.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.ceil.f64(double) nounwind readnone -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}fceil_f64: -; CI: v_ceil_f64_e32 -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI-DAG: v_cmp_lt_f64 -; SI-DAG: v_cmp_lg_f64 -; SI: s_and_b64 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_add_f64 -; SI: s_endpgm -define void @fceil_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.ceil.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v2f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}fceil_v3f64: -; FIXME-CI: v_ceil_f64_e32 -; FIXME-CI: v_ceil_f64_e32 -; FIXME-CI: v_ceil_f64_e32 -; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}fceil_v4f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v8f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v16f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: 
v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll deleted file mode 100644 index 530274f920f..00000000000 --- a/test/CodeGen/R600/fcmp-cnd.ll +++ /dev/null @@ -1,14 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;Not checking arguments 2 and 3 to CNDE, because they may change between -;registers and literal.x depending on what the optimizer does. -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %cmp = fcmp oeq float %0, 0.000000e+00 - %value = select i1 %cmp, i32 2, i32 3 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll deleted file mode 100644 index c402805feb3..00000000000 --- a/test/CodeGen/R600/fcmp-cnde-int-args.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the -; chance to optimize the fcmp + select instructions to SET* was missed -; due to the fact that the operands to fcmp and select had different types - -; CHECK: SET{{[A-Z]+}}_DX10 - -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %cmp = fcmp oeq float %0, 0.000000e+00 - %value = select i1 %cmp, i32 -1, i32 0 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll deleted file mode 100644 index 5207ab57bad..00000000000 --- a/test/CodeGen/R600/fcmp.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}fcmp_sext: -; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 - %1 = load float, float addrspace(1)* %arrayidx1 - %cmp = fcmp oeq float %0, %1 - %sext = sext i1 %cmp to i32 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -; This test checks that a setcc node with f32 operands is lowered to a -; SET*_DX10 instruction. 
Previously we were lowering this to: -; SET* + FP_TO_SINT - -; CHECK: {{^}}fcmp_br: -; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} -; CHECK-NEXT {{[0-9]+(5.0}} - -define void @fcmp_br(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - br i1 %0, label %IF, label %ENDIF - -IF: - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %1 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll deleted file mode 100644 index 053ab0ed7aa..00000000000 --- a/test/CodeGen/R600/fcmp64.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}flt_f64: -; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ult double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fle_f64: -; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ule double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fgt_f64: -; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ugt double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fge_f64: -; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp uge double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fne_f64: -; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp une double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}feq_f64: -; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ueq double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll deleted file mode 100644 index 89af37545c9..00000000000 --- a/test/CodeGen/R600/fconst64.ll +++ 
/dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fconst_f64: -; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 -; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 - -define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r1 = load double, double addrspace(1)* %in - %r2 = fadd double %r1, 5.000000e+00 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll deleted file mode 100644 index b719d5a3978..00000000000 --- a/test/CodeGen/R600/fcopysign.f32.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare float @llvm.copysign.f32(float, float) nounwind readnone -declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone - -; Try to identify arg based on higher address. -; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb -; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc -; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c -; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 -; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] -; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] -; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BFI_INT -define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { - %result = call float @llvm.copysign.f32(float %mag, float %sign) - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v2f32: -; GCN: s_endpgm - -; EG: BFI_INT -; EG: BFI_INT -define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { - %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) - store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v4f32: -; GCN: s_endpgm - -; EG: BFI_INT -; EG: BFI_INT -; EG: BFI_INT -; EG: BFI_INT -define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { - %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll deleted file mode 100644 index 3d8c5599308..00000000000 --- a/test/CodeGen/R600/fcopysign.f64.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -declare double @llvm.copysign.f64(double, double) nounwind readnone -declare <2 x double> 
@llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone -declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}test_copysign_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] -; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] -; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff -; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] -; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -; GCN: s_endpgm -define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { - %result = call double @llvm.copysign.f64(double %mag, double %sign) - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v2f64: -; GCN: s_endpgm -define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { - %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) - store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v4f64: -; GCN: s_endpgm -define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { - %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) - store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/fdiv.f64.ll b/test/CodeGen/R600/fdiv.f64.ll deleted file mode 100644 index 7c022e38c80..00000000000 --- a/test/CodeGen/R600/fdiv.f64.ll +++ /dev/null @@ -1,96 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s - - -; COMMON-LABEL: {{^}}fdiv_f64: -; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 -; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] - -; Check for div_scale bug workaround on SI -; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] - -; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] - -; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} -; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} -; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc - -; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 
-; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] -; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 -; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] -; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] -; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] -; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] -; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] -; COMMON: buffer_store_dwordx2 [[RESULT]] -; COMMON: s_endpgm -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { - %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 - %num = load double, double addrspace(1)* %in - %den = load double, double addrspace(1)* %gep.1 - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_s_v: -define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { - %den = load double, double addrspace(1)* %in - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_v_s: -define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { - %num = load double, double addrspace(1)* %in - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_s_s: -define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}v_fdiv_v2f64: -define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { - %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 - %num = load <2 x double>, <2 x double> addrspace(1)* %in - %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 - %result = fdiv <2 x double> %num, %den - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}s_fdiv_v2f64: -define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { - %result = fdiv <2 x double> %num, %den - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}v_fdiv_v4f64: -define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { - %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 - %num = load <4 x double>, <4 x double> addrspace(1)* %in - %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 - %result = fdiv <4 x double> %num, %den - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}s_fdiv_v4f64: -define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { - %result = fdiv <4 x double> %num, %den - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll deleted file mode 100644 index 7cbf8733639..00000000000 --- a/test/CodeGen/R600/fdiv.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | 
FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; These tests check that fdiv is expanded correctly and also test that the -; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate -; instruction groups. - -; FUNC-LABEL: {{^}}fdiv_f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fdiv float %a, %b - store float %0, float addrspace(1)* %out - ret void -} - - - -; FUNC-LABEL: {{^}}fdiv_v2f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { -entry: - %0 = fdiv <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fdiv_v4f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1) * %in - %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr - %result = fdiv <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fetch-limits.r600.ll b/test/CodeGen/R600/fetch-limits.r600.ll deleted file mode 100644 index e7160ef5d72..00000000000 --- a/test/CodeGen/R600/fetch-limits.r600.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s - -; R600 supports 8 fetches in a clause -; CHECK: {{^}}fetch_limits_r600: -; CHECK: Fetch clause -; CHECK: Fetch clause - -define void @fetch_limits_r600() #0 { -entry: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %4 = load <4 x 
float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %a = fadd <4 x float> %res0, %res1 - %b = fadd <4 x float> %res2, %res3 - %c = fadd <4 x float> %res4, %res5 - %d = fadd <4 x float> %res6, %res7 - %e = fadd <4 x float> %res8, %a - - %bc = fadd <4 x float> %b, %c - %de = fadd <4 x float> %d, %e - - %bcde = fadd <4 x float> %bc, %de - - call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) - ret void -} - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/fetch-limits.r700+.ll b/test/CodeGen/R600/fetch-limits.r700+.ll deleted file mode 100644 index acaea2aa794..00000000000 --- a/test/CodeGen/R600/fetch-limits.r700+.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; r700+ supports 16 fetches in a clause -; CHECK: {{^}}fetch_limits_r700: -; CHECK: Fetch clause -; CHECK: Fetch clause - -define void @fetch_limits_r700() #0 { -entry: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 
3) - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) - %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) - %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) - %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) - %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) - %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) - %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) - %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) - %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) - %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) - %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1) - %a = fadd <4 x float> %res0, %res1 - %b = fadd <4 x float> %res2, %res3 - %c = fadd <4 x float> %res4, %res5 - %d = fadd <4 x float> %res6, %res7 - %e = fadd <4 x float> %res8, %res9 - %f = fadd <4 x float> %res10, %res11 - %g = fadd <4 x float> %res12, %res13 - %h = fadd <4 x float> 
%res14, %res15 - %i = fadd <4 x float> %res16, %a - - %bc = fadd <4 x float> %b, %c - %de = fadd <4 x float> %d, %e - %fg = fadd <4 x float> %f, %g - %hi = fadd <4 x float> %h, %i - - %bcde = fadd <4 x float> %bc, %de - %fghi = fadd <4 x float> %fg, %hi - - %bcdefghi = fadd <4 x float> %bcde, %fghi - call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) - ret void -} - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/ffloor.f64.ll b/test/CodeGen/R600/ffloor.f64.ll deleted file mode 100644 index 45f8382c392..00000000000 --- a/test/CodeGen/R600/ffloor.f64.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.fabs.f64(double %Val) -declare double @llvm.floor.f64(double) nounwind readnone -declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}ffloor_f64: -; CI: v_floor_f64_e32 -; SI: v_fract_f64_e32 -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 -; SI: s_endpgm -define void @ffloor_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.floor.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_f64_neg: -; CI: v_floor_f64_e64 -; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]] -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] -; SI: s_endpgm -define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { - %neg = fsub double 0.0, %x - %y = call double @llvm.floor.f64(double %neg) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_f64_neg_abs: -; CI: v_floor_f64_e64 -; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]| -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| -; SI: s_endpgm -define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { - %abs = call double @llvm.fabs.f64(double %x) - %neg = fsub double 0.0, %abs - %y = call double @llvm.floor.f64(double %neg) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v2f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64: -; FIXME-CI: v_floor_f64_e32 -; FIXME-CI: 
v_floor_f64_e32 -; FIXME-CI: v_floor_f64_e32 -; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ffloor_v4f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v8f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v16f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll deleted file mode 100644 index 61c46ac2bc0..00000000000 --- a/test/CodeGen/R600/ffloor.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}floor_f32: -; SI: v_floor_f32_e32 -; R600: FLOOR -define void @floor_f32(float addrspace(1)* %out, float %in) { - %tmp = call float @llvm.floor.f32(float %in) #0 - store float %tmp, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}floor_v2f32: -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 - -define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 - store <2 x float> %tmp, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}floor_v4f32: -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 - -; R600: FLOOR -; R600: FLOOR -; R600: FLOOR -; R600: FLOOR -define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 - store <4 x float> %tmp, <4 x float> addrspace(1)* %out - ret void -} - -; Function Attrs: nounwind readonly -declare float @llvm.floor.f32(float) #0 - -; Function Attrs: nounwind readonly -declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0 - -; Function Attrs: nounwind readonly -declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0 - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/flat-address-space.ll 
b/test/CodeGen/R600/flat-address-space.ll deleted file mode 100644 index 8ceca078f2d..00000000000 --- a/test/CodeGen/R600/flat-address-space.ll +++ /dev/null @@ -1,184 +0,0 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s - -; Disable optimizations in case there are optimizations added that -; specialize away generic pointer accesses. - - -; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - -; These testcases might become useless when there are optimizations to -; remove generic pointers. - -; CHECK-LABEL: {{^}}store_flat_i32: -; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} -; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_i64: -; CHECK: flat_store_dwordx2 -define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { - %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - store i64 %x, i64 addrspace(4)* %fptr, align 8 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_v4i32: -; CHECK: flat_store_dwordx4 -define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { - %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_trunc_i16: -; CHECK: flat_store_short -define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %y = trunc i32 %x to i16 - store i16 %y, i16 addrspace(4)* %fptr, align 2 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_trunc_i8: -; CHECK: flat_store_byte -define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %y = trunc i32 %x to i8 - store i8 %y, i8 addrspace(4)* %fptr, align 2 - ret void -} - - - -; CHECK-LABEL @load_flat_i32: -; CHECK: flat_load_dword -define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) 
#0 { - %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - %fload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %fload, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @load_flat_i64: -; CHECK: flat_load_dwordx2 -define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - %fload = load i64, i64 addrspace(4)* %fptr, align 4 - store i64 %fload, i64 addrspace(1)* %out, align 8 - ret void -} - -; CHECK-LABEL @load_flat_v4i32: -; CHECK: flat_load_dwordx4 -define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4 - store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; CHECK-LABEL @sextload_flat_i8: -; CHECK: flat_load_sbyte -define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load i8, i8 addrspace(4)* %fptr, align 4 - %ext = sext i8 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @zextload_flat_i8: -; CHECK: flat_load_ubyte -define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load i8, i8 addrspace(4)* %fptr, align 4 - %ext = zext i8 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @sextload_flat_i16: -; CHECK: flat_load_sshort -define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load i16, i16 addrspace(4)* %fptr, align 4 - %ext = sext i16 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @zextload_flat_i16: -; CHECK: flat_load_ushort -define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load i16, i16 addrspace(4)* %fptr, align 4 - %ext = zext i16 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - - - -; TODO: This should not be zero when registers are used for small -; scratch allocations again. - -; Check for prologue initializing special SGPRs pointing to scratch. 
-; CHECK-LABEL: {{^}}store_flat_scratch: -; CHECK: s_movk_i32 flat_scratch_lo, 0 -; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} -; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} -; CHECK: flat_store_dword -; CHECK: s_barrier -; CHECK: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 - %x = call i32 @llvm.r600.read.tidig.x() #3 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr - ; Dummy call - call void @llvm.AMDGPU.barrier.local() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 - ret void -} - -declare void @llvm.AMDGPU.barrier.local() #1 -declare i32 @llvm.r600.read.tidig.x() #3 - -attributes #0 = { nounwind } -attributes #1 = { nounwind noduplicate } -attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll deleted file mode 100644 index c6bfb8567a0..00000000000 --- a/test/CodeGen/R600/floor.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s - -; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = call float @floor(float %r0) - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @floor(float) readonly -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/fma-combine.ll b/test/CodeGen/R600/fma-combine.ll deleted file mode 100644 index bd574b87711..00000000000 --- a/test/CodeGen/R600/fma-combine.ll +++ /dev/null @@ -1,368 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare double @llvm.fabs.f64(double) #0 -declare double @llvm.fma.f64(double, double, double) #0 -declare float @llvm.fma.f32(float, float, float) #0 - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_fma_f64_0: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 
- - %mul = fmul double %a, %b - %fma = fadd double %mul, %c - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %fma0 = fadd double %mul, %c - %fma1 = fadd double %mul, %d - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fadd x, (fmul y, z)) -> (fma y, z, x) -; FUNC-LABEL: {{^}}combine_to_fma_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %fma = fadd double %c, %mul - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %fma = fsub double %mul, %c - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %fma0 = fsub double %mul, %c - %fma1 = fsub double %mul, %d - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %fma = fsub double %c, %mul - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %fma0 = fsub double %c, %mul - %fma1 = fsub double %d, %mul - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias 
%out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma = fsub double %mul.neg, %c - - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma0 = fsub double %mul.neg, %c - %fma1 = fsub double %mul.neg, %d - - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void 
@combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma0 = fsub double %mul.neg, %c - %fma1 = fsub double %mul, %d - - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} -; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]] -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %x = load double, double addrspace(1)* %gep.0 - %y = load double, double addrspace(1)* %gep.1 - %z = load double, double addrspace(1)* %gep.2 - %u = load double, double addrspace(1)* %gep.3 - %v = load double, double addrspace(1)* %gep.4 - - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 - %tmp2 = fsub double %tmp1, %z - - store double %tmp2, double addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fma y, z, (fmul u, v))) -; -> (fma (fneg y), z, (fma (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; 
SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} -; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]] -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %x = load double, double addrspace(1)* %gep.0 - %y = load double, double addrspace(1)* %gep.1 - %z = load double, double addrspace(1)* %gep.2 - %u = load double, double addrspace(1)* %gep.3 - %v = load double, double addrspace(1)* %gep.4 - - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 - %tmp2 = fsub double %x, %tmp1 - - store double %tmp2, double addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/fma.f64.ll b/test/CodeGen/R600/fma.f64.ll deleted file mode 100644 index 0a55ef77855..00000000000 --- a/test/CodeGen/R600/fma.f64.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.fma.f64(double, double, double) nounwind readnone -declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone -declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone - - -; FUNC-LABEL: {{^}}fma_f64: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2, double addrspace(1)* %in3) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = load double, double addrspace(1)* %in3 - %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2) - store double %r3, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v2f64: -; SI: v_fma_f64 -; SI: v_fma_f64 -define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 - %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3 - %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) - store <2 x double> %r3, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v4f64: -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, - <4 x 
double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { - %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 - %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 - %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3 - %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2) - store <4 x double> %r3, <4 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll deleted file mode 100644 index d6024aa0b4c..00000000000 --- a/test/CodeGen/R600/fma.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fma.f32(float, float, float) nounwind readnone -declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}fma_f32: -; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, -; EG: FMA {{\*? *}}[[RES]] -define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2, float addrspace(1)* %in3) { - %r0 = load float, float addrspace(1)* %in1 - %r1 = load float, float addrspace(1)* %in2 - %r2 = load float, float addrspace(1)* %in3 - %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) - store float %r3, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v2f32: -; SI: v_fma_f32 -; SI: v_fma_f32 - -; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, -; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] -; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] -define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, - <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { - %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 - %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 - %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3 - %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) - store <2 x float> %r3, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v4f32: -; SI: v_fma_f32 -; SI: v_fma_f32 -; SI: v_fma_f32 -; SI: v_fma_f32 - -; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, -; EG-DAG: FMA {{\*? *}}[[RES]].X -; EG-DAG: FMA {{\*? *}}[[RES]].Y -; EG-DAG: FMA {{\*? *}}[[RES]].Z -; EG-DAG: FMA {{\*? 
*}}[[RES]].W -define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, - <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { - %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 - %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 - %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3 - %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) - store <4 x float> %r3, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 -; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}} -define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b) - store float %fma, float addrspace(1)* %out.gep, align 4 - ret void -} - -; FUNC-LABEL: @fma_commute_mul_s_f32 -define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %c = load float, float addrspace(1)* %in.b.gep, align 4 - - %fma = call float @llvm.fma.f32(float %a, float %b, float %c) - store float %fma, float addrspace(1)* %out.gep, align 4 - ret void -} diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll deleted file mode 100644 index 935e35123f4..00000000000 --- a/test/CodeGen/R600/fmad.ll +++ /dev/null @@ -1,19 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = extractelement <4 x float> %reg0, i32 2 - %r3 = fmul float %r0, %r1 - %r4 = fadd float %r3, %r2 - %vec = insertelement <4 x float> undef, float %r4, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @fabs(float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll deleted file mode 100644 index d7127f485c7..00000000000 --- a/test/CodeGen/R600/fmax.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp oge float %r0, %r1 - %r3 = select i1 %r2, float %r0, float %r1 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call 
void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/fmax3.f64.ll b/test/CodeGen/R600/fmax3.f64.ll deleted file mode 100644 index f78c71b2826..00000000000 --- a/test/CodeGen/R600/fmax3.f64.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.maxnum.f64(double, double) nounwind readnone - -; SI-LABEL: {{^}}test_fmax3_f64: -; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}} -; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 -; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] -; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { - %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 - %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 - %a = load double, double addrspace(1)* %aptr, align 8 - %b = load double, double addrspace(1)* %bptr, align 8 - %c = load double, double addrspace(1)* %cptr, align 8 - %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone - %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone - store double %f1, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll deleted file mode 100644 index c3028a6217d..00000000000 --- a/test/CodeGen/R600/fmax3.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.maxnum.f32(float, float) nounwind readnone - -; SI-LABEL: {{^}}test_fmax3_olt_0: -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} - -; Commute operand of second fmax -; SI-LABEL: {{^}}test_fmax3_olt_1: -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b 
= load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/fmax_legacy.f64.ll b/test/CodeGen/R600/fmax_legacy.f64.ll deleted file mode 100644 index 828243888ac..00000000000 --- a/test/CodeGen/R600/fmax_legacy.f64.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; Make sure we don't try to form FMAX_LEGACY nodes with f64 - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmax_legacy_uge_f64 -define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp uge double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_oge_f64 -define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp oge double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ugt_f64 -define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ugt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ogt_f64 -define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ogt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll deleted file mode 100644 index 413957d2982..00000000000 --- a/test/CodeGen/R600/fmax_legacy.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn 
-mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FIXME: Should replace unsafe-fp-math with no signed zeros. - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmax_legacy_uge_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] - -; EG: MAX -define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp uge float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_oge_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; EG: MAX -define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp oge float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ugt_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; EG: MAX -define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ugt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ogt_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; EG: MAX -define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, 
float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ogt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - - -; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-NOT: v_max_ -; SI: v_cmp_gt_f32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_max_ - -; EG: MAX -define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ogt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out0, align 4 - store i1 %cmp, i1 addrspace(1)* %out1 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmaxnum.f64.ll b/test/CodeGen/R600/fmaxnum.f64.ll deleted file mode 100644 index de563cec341..00000000000 --- a/test/CodeGen/R600/fmaxnum.f64.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.maxnum.f64(double, double) #0 -declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0 -declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0 -declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0 -declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0 - -; FUNC-LABEL: @test_fmax_f64 -; SI: v_max_f64 -define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { - %val = call double @llvm.maxnum.f64(double %a, double %b) #0 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_v2f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 - store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmax_v4f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 - store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmax_v8f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 - store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 - ret 
void -} - -; FUNC-LABEL: @test_fmax_v16f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 - store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll deleted file mode 100644 index 3029bd02e4d..00000000000 --- a/test/CodeGen/R600/fmaxnum.ll +++ /dev/null @@ -1,283 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.maxnum.f32(float, float) #0 -declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0 - -declare double @llvm.maxnum.f64(double, double) - -; FUNC-LABEL: @test_fmax_f32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float %b) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_v2f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 - store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_v4f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0 - store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmax_v8f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> 
%b) #0 - store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmax_v16f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W -define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 - store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344(nan) -define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_val_nan -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_nan_val -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} 
- -; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_var_immediate_f32 -; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_immediate_var_f32 -; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], 
{{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll deleted file mode 100644 index defa8c09638..00000000000 --- a/test/CodeGen/R600/fmin.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp uge float %r0, %r1 - %r3 = select i1 %r2, float %r1, float %r0 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll deleted file mode 100644 index 0a76699b43e..00000000000 --- a/test/CodeGen/R600/fmin3.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.minnum.f32(float, float) nounwind readnone - -; SI-LABEL: {{^}}test_fmin3_olt_0: -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} - -; Commute operand of second fmin -; SI-LABEL: {{^}}test_fmin3_olt_1: -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/fmin_legacy.f64.ll b/test/CodeGen/R600/fmin_legacy.f64.ll deleted file mode 100644 index e19a48f3f7e..00000000000 --- a/test/CodeGen/R600/fmin_legacy.f64.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc -march=amdgcn 
-mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmin_legacy_f64 -define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { - %r0 = extractelement <4 x double> %reg0, i32 0 - %r1 = extractelement <4 x double> %reg0, i32 1 - %r2 = fcmp uge double %r0, %r1 - %r3 = select i1 %r2, double %r1, double %r0 - %vec = insertelement <4 x double> undef, double %r3, i32 0 - store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ule_f64 -define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ule double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f64 -define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ole double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_olt_f64 -define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp olt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ult_f64 -define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ult double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll deleted file mode 100644 index 6a625c239d7..00000000000 --- a/test/CodeGen/R600/fmin_legacy.ll +++ /dev/null @@ -1,123 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG 
-check-prefix=FUNC %s - -; FIXME: Should replace unsafe-fp-math with no signed zeros. - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmin_legacy_f32 -; EG: MIN * -; SI-SAFE: v_min_legacy_f32_e32 -; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp uge float %r0, %r1 - %r3 = select i1 %r2, float %r1, float %r0 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ule_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ule float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ole float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_olt_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp olt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ult_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ult float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-NOT: v_min -; SI: v_cmp_le_f32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_min -; SI: s_endpgm -define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ole float %a, %b - %val0 = select i1 %cmp, float %a, float %b - store float %val0, float addrspace(1)* %out0, align 4 - store i1 %cmp, i1 addrspace(1)* %out1 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fminnum.f64.ll b/test/CodeGen/R600/fminnum.f64.ll deleted file mode 100644 index 0f929d6a81f..00000000000 --- a/test/CodeGen/R600/fminnum.f64.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.minnum.f64(double, double) #0 -declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 -declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 -declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 -declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 - -; FUNC-LABEL: @test_fmin_f64 -; SI: v_min_f64 -define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { - %val = call double @llvm.minnum.f64(double %a, double %b) #0 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_v2f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 - store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_v4f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 - store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmin_v8f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; 
SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 - store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @test_fmin_v16f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 - store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll deleted file mode 100644 index 4d7b52540d8..00000000000 --- a/test/CodeGen/R600/fminnum.ll +++ /dev/null @@ -1,281 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.minnum.f32(float, float) #0 -declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 - -; FUNC-LABEL: @test_fmin_f32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.minnum.f32(float %a, float %b) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_v2f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 - store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_v4f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 - store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_v8f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 
{{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 - store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmin_v16f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 - store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_val_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_nan_val -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 
[[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_var_immediate_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_immediate_var_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_literal_f32(float 
addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll deleted file mode 100644 index addc409c9eb..00000000000 --- a/test/CodeGen/R600/fmul.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}fmul_f32: -; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W - -; SI: v_mul_f32 -define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fmul float %a, %b - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.R600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - -; FUNC-LABEL: {{^}}fmul_v2f32: -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} - -; SI: v_mul_f32 -; SI: v_mul_f32 -define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { -entry: - %0 = fmul <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v4f32: -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1) * %in - %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr - %result = fmul <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_mul_2_k: -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -; SI: s_endpgm -define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { - %y = fmul float %x, 2.0 - %z = fmul float %y, 3.0 - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_mul_2_k_inv: -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -; SI-NOT: v_mad_f32 -; SI: s_endpgm -define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { - %y = fmul float %x, 3.0 - %z = fmul float %y, 2.0 - store float %z, float addrspace(1)* %out - ret void -} - -; There should be three multiplies here; %a should be used twice (once -; negated), not duplicated into mul x, 5.0 and mul x, -5.0. 
-; FUNC-LABEL: {{^}}test_mul_twouse: -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { - %a = fmul float %x, 5.0 - %b = fsub float -0.0, %a - %c = fmul float %b, %y - %d = fmul float %c, %a - store float %d, float addrspace(1)* %out - ret void -} - -attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll deleted file mode 100644 index 3c222eaba89..00000000000 --- a/test/CodeGen/R600/fmul64.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - -; FUNC-LABEL: {{^}}fmul_f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fmul double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v2f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2) { - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 - %r2 = fmul <2 x double> %r0, %r1 - store <2 x double> %r2, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v4f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, - <4 x double> addrspace(1)* %in2) { - %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 - %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 - %r2 = fmul <4 x double> %r0, %r1 - store <4 x double> %r2, <4 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll deleted file mode 100644 index ae84d841021..00000000000 --- a/test/CodeGen/R600/fmuladd.ll +++ /dev/null @@ -1,199 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s - -declare float @llvm.fmuladd.f32(float, float, float) -declare double @llvm.fmuladd.f64(double, double, double) -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; CHECK-LABEL: {{^}}fmuladd_f32: -; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} - -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2, float addrspace(1)* %in3) { - %r0 = load float, float addrspace(1)* %in1 - %r1 = load float, float addrspace(1)* %in2 - %r2 = load float, float addrspace(1)* %in3 - %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) - store float %r3, float addrspace(1)* %out - ret void -} 
- -; CHECK-LABEL: {{^}}fmuladd_f64: -; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2, double addrspace(1)* %in3) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = load double, double addrspace(1)* %in3 - %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) - store double %r3, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; CHECK-LABEL: {{^}}fadd_a_a_b_f32: -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load float, float addrspace(1)* %gep.0 - %r1 = load float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %add.0, %r1 - store float %add.1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fadd_b_a_a_f32: -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load float, float addrspace(1)* %gep.0 - %r1 = load float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %r1, %add.0 - store float %add.1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r1.fneg = fsub float -0.000000e+00, %r1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* 
%out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r1.fneg = fsub float -0.000000e+00, %r1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r2.fneg = fsub float -0.000000e+00, %r2 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) - store float %r3, float addrspace(1)* %gep.out - ret void -} diff --git a/test/CodeGen/R600/fnearbyint.ll b/test/CodeGen/R600/fnearbyint.ll deleted file mode 100644 index 4fa9adaabda..00000000000 --- a/test/CodeGen/R600/fnearbyint.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s - -; This should have the exactly the same output as the test for rint, -; so no need to check anything. 
- -declare float @llvm.nearbyint.f32(float) #0 -declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0 -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0 -declare double @llvm.nearbyint.f64(double) #0 -declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 -declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 - - -define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { -entry: - %0 = call float @llvm.nearbyint.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { -entry: - %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { -entry: - %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -define void @nearbyint_f64(double addrspace(1)* %out, double %in) { -entry: - %0 = call double @llvm.nearbyint.f64(double %in) - store double %0, double addrspace(1)* %out - ret void -} -define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { -entry: - %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) - store <2 x double> %0, <2 x double> addrspace(1)* %out - ret void -} - -define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { -entry: - %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) - store <4 x double> %0, <4 x double> addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind readonly } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll deleted file mode 100644 index 8830e827366..00000000000 --- a/test/CodeGen/R600/fneg-fabs.f64.ll +++ /dev/null @@ -1,100 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FIXME: Check something here. Currently it seems fabs + fneg aren't -; into 2 modifiers, although theoretically that should work. 
- -; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}| -define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { - %fabs = call double @llvm.fabs.f64(double %x) - %fsub = fsub double -0.000000e+00, %fabs - %fadd = fadd double %y, %fsub - store double %fadd, double addrspace(1)* %out, align 8 - ret void -} - -define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { - %x = load double, double addrspace(1)* %xptr, align 8 - %y = load double, double addrspace(1)* %xptr, align 8 - %fabs = call double @llvm.fabs.f64(double %x) - %fsub = fsub double -0.000000e+00, %fabs - %fadd = fadd double %y, %fsub - store double %fadd, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}| -define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { - %fabs = call double @llvm.fabs.f64(double %x) - %fsub = fsub double -0.000000e+00, %fabs - %fmul = fmul double %y, %fsub - store double %fmul, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_free_f64: -define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fabs = call double @llvm.fabs.f64(double %bc) - %fsub = fsub double -0.000000e+00, %fabs - store double %fsub, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fabs = call double @fabs(double %bc) - %fsub = fsub double -0.000000e+00, %fabs - store double %fsub, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_f64: -; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}} -; SI: s_load_dwordx2 -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] -; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} -define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { - %fabs = call double @llvm.fabs.f64(double %in) - %fsub = fsub double -0.000000e+00, %fabs - store double %fsub, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v2f64: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { - %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) - %fsub = fsub <2 x double> , %fabs - store <2 x double> %fsub, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v4f64: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { - %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) - %fsub = fsub <4 x double> , %fabs - 
store <4 x double> %fsub, <4 x double> addrspace(1)* %out - ret void -} - -declare double @fabs(double) readnone -declare double @llvm.fabs.f64(double) readnone -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone -declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll deleted file mode 100644 index 3b4930d9897..00000000000 --- a/test/CodeGen/R600/fneg-fabs.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: -; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| -define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { - %fabs = call float @llvm.fabs.f32(float %x) - %fsub = fsub float -0.000000e+00, %fabs - %fadd = fadd float %y, %fsub - store float %fadd, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: -; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| -; SI-NOT: and -define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { - %fabs = call float @llvm.fabs.f32(float %x) - %fsub = fsub float -0.000000e+00, %fabs - %fmul = fmul float %y, %fsub - store float %fmul, float addrspace(1)* %out, align 4 - ret void -} - -; DAGCombiner will transform: -; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fneg_fabs_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fabs = call float @llvm.fabs.f32(float %bc) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fabs = call float @fabs(float %bc) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { - %fabs = call float @llvm.fabs.f32(float %in) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_fneg_fabs_f32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %val = load float, float addrspace(1)* %in, align 4 - %fabs = call float @llvm.fabs.f32(float %val) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: -PV -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: 
-PV - -; FIXME: SGPR should be used directly for first src operand. -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) - %fsub = fsub <2 x float> , %fabs - store <2 x float> %fsub, <2 x float> addrspace(1)* %out - ret void -} - -; FIXME: SGPR should be used directly for first src operand. -; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) - %fsub = fsub <4 x float> , %fabs - store <4 x float> %fsub, <4 x float> addrspace(1)* %out - ret void -} - -declare float @fabs(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/fneg.f64.ll b/test/CodeGen/R600/fneg.f64.ll deleted file mode 100644 index aa6df209035..00000000000 --- a/test/CodeGen/R600/fneg.f64.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_f64: -; GCN: v_xor_b32 -define void @fneg_f64(double addrspace(1)* %out, double %in) { - %fneg = fsub double -0.000000e+00, %in - store double %fneg, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v2f64: -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { - %fneg = fsub <2 x double> , %in - store <2 x double> %fneg, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v4f64: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { - %fneg = fsub <4 x double> , %in - store <4 x double> %fneg, <4 x double> addrspace(1)* %out - ret void -} - -; DAGCombiner will transform: -; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x80000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fneg_free_f64: -; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}} -define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fsub = fsub double 0.0, %bc - store double %fsub, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fneg_fold_f64: -; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-NOT: xor -; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { - %fsub = fsub double -0.0, %in - %fmul = fmul double %fsub, %in - 
store double %fmul, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll deleted file mode 100644 index a0fd539863c..00000000000 --- a/test/CodeGen/R600/fneg.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_f32: -; R600: -PV - -; GCN: v_xor_b32 -define void @fneg_f32(float addrspace(1)* %out, float %in) { - %fneg = fsub float -0.000000e+00, %in - store float %fneg, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v2f32: -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { - %fneg = fsub <2 x float> , %in - store <2 x float> %fneg, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v4f32: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { - %fneg = fsub <4 x float> , %in - store <4 x float> %fneg, <4 x float> addrspace(1)* %out - ret void -} - -; DAGCombiner will transform: -; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fneg_free_f32: -; R600-NOT: XOR -; R600: -KC0[2].Z - -; XXX: We could use v_add_f32_e64 with the negate bit here instead. -; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} -define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fsub = fsub float 0.0, %bc - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fold_f32: -; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: xor -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { - %fsub = fsub float -0.0, %in - %fmul = fmul float %fsub, %in - store float %fmul, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fp-classify.ll b/test/CodeGen/R600/fp-classify.ll deleted file mode 100644 index 4fac5176fac..00000000000 --- a/test/CodeGen/R600/fp-classify.ll +++ /dev/null @@ -1,131 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_isinf_pattern: -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm -define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; 
SI-LABEL: {{^}}test_not_isinf_pattern_0: -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_not_isinf_pattern_1: -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_isfinite_pattern_0: -; SI-NOT: v_cmp -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm -define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Use negative infinity -; SI-LABEL: {{^}}test_isfinite_not_pattern_0: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; No fabs -; SI-LABEL: {{^}}test_isfinite_not_pattern_1: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %ninf = fcmp une float %x, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; fabs of different value -; SI-LABEL: {{^}}test_isfinite_not_pattern_2: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Wrong ordered compare type -; SI-LABEL: {{^}}test_isfinite_not_pattern_3: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp uno float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Wrong unordered compare -; SI-LABEL: {{^}}test_isfinite_not_pattern_4: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 
%and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll deleted file mode 100644 index 5a79ca82bc2..00000000000 --- a/test/CodeGen/R600/fp16_to_fp.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone -declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp16_to_fp32: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; SI-LABEL: {{^}}test_convert_fp16_to_fp64: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] -; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone - store double %cvt, double addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll deleted file mode 100644 index 67925ebd82b..00000000000 --- a/test/CodeGen/R600/fp32_to_fp16.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp32_to_fp16: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_short [[RESULT]] -define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %val = load float, float addrspace(1)* %in, align 4 - %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone - store i16 %cvt, i16 addrspace(1)* %out, align 2 - ret void -} diff --git a/test/CodeGen/R600/fp_to_sint.f64.ll b/test/CodeGen/R600/fp_to_sint.f64.ll deleted file mode 100644 index 12df6606e8f..00000000000 --- a/test/CodeGen/R600/fp_to_sint.f64.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @fp_to_sint_f64_i32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { - %result = fptosi double %in to i32 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v2f64_v2i32(<2 x 
i32> addrspace(1)* %out, <2 x double> %in) { - %result = fptosi <2 x double> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_v4f64_v4i32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { - %result = fptosi <4 x double> %in to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} -; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 - -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} -; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] - -; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 - -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] -; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] -; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %val = load double, double addrspace(1)* %gep, align 8 - %cast = fptosi double %val to i64 - store i64 %cast, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll deleted file mode 100644 index 301a94b4904..00000000000 --- a/test/CodeGen/R600/fp_to_sint.ll +++ /dev/null @@ -1,230 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -declare float @llvm.fabs.f32(float) #0 - -; FUNC-LABEL: {{^}}fp_to_sint_i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: s_endpgm -define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { - %conv = fptosi float %in to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: -; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { - %in.fabs = call float @llvm.fabs.f32(float %in) #0 - %conv = fptosi float %in.fabs to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_v2i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { - %result = fptosi <2 x float> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_v4i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -; 
SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %value = load <4 x float>, <4 x float> addrspace(1) * %in - %result = fptosi <4 x float> %value to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_i64: - -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; Check that the compiler doesn't crash with a "cannot select" error -; SI: s_endpgm -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { -entry: - %0 = fptosi float %in to i64 - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_sint_v2i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { - %conv = fptosi <2 x float> %x to <2 x i64> - store <2 x i64> %conv, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_sint_v4i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; 
EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { - %conv = fptosi <4 x float> %x to <4 x i64> - store <4 x i64> %conv, <4 x i64> addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll deleted file mode 100644 index 41bc2a78001..00000000000 --- a/test/CodeGen/R600/fp_to_uint.f64.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}fp_to_uint_i32_f64: -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { - %cast = fptoui double %in to i32 - store i32 %cast, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @fp_to_uint_v2i32_v2f64 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { - %cast = fptoui <2 x double> %in to <2 x i32> - store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: @fp_to_uint_v4i32_v4f64 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { - %cast = fptoui <4 x double> %in to <4 x i32> - store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @fp_to_uint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} -; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 - -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} -; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] - -; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 - -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] -; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] -; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %val = load double, double addrspace(1)* %gep, align 8 - %cast = fptoui double %val to i64 - store i64 %cast, i64 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @fp_to_uint_v2i64_v2f64 -define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { - %cast = fptoui <2 x double> %in to <2 x i64> - store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: @fp_to_uint_v4i64_v4f64 -define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { - %cast = fptoui <4 x double> %in to <4 x i64> - store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll deleted file mode 100644 index b7b6ccc238b..00000000000 --- a/test/CodeGen/R600/fp_to_uint.ll +++ /dev/null @@ -1,217 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC -; 
RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} - -; SI: v_cvt_u32_f32_e32 -; SI: s_endpgm -define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { - %conv = fptoui float %in to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { - %result = fptoui <2 x float> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 - -define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %value = load <4 x float>, <4 x float> addrspace(1) * %in - %result = fptoui <4 x float> %value to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_f32_to_i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { - %conv = fptoui float %x to i64 - store i64 %conv, i64 addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { - %conv = fptoui <2 x float> %x to <2 x i64> - store <2 x i64> %conv, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: 
LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { - %conv = fptoui <4 x float> %x to <4 x i64> - store <4 x i64> %conv, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll deleted file mode 100644 index 734a43be229..00000000000 --- a/test/CodeGen/R600/fpext.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fpext_f32_to_f64: -; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { - %result = fpext float %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { - %result = fpext <2 x float> %in to <2 x double> - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { - %result = fpext <4 x float> %in to <4 x double> - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { - %result = fpext <8 x float> %in to <8 x double> - store <8 x double> %result, <8 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll deleted file mode 100644 index 385e10e7baa..00000000000 --- a/test/CodeGen/R600/fptrunc.ll +++ /dev/null @@ -1,45 +0,0 @@ -; 
RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: -; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { - %result = fptrunc double %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { - %result = fptrunc <2 x double> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { - %result = fptrunc <4 x double> %in to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { - %result = fptrunc <8 x double> %in to <8 x float> - store <8 x float> %result, <8 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll deleted file mode 100644 index f245ef08cb9..00000000000 --- a/test/CodeGen/R600/frem.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}frem_f32: -; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} -; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN-DAG: v_cmp -; GCN-DAG: v_mul_f32 -; GCN: v_rcp_f32_e32 -; GCN: v_mul_f32_e32 -; GCN: v_mul_f32_e32 -; GCN: v_trunc_f32_e32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2) #0 { - %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 - %r0 = load float, float addrspace(1)* %in1, align 4 - %r1 = load float, float addrspace(1)* %gep2, align 4 - %r2 = frem float %r0, %r1 - store float %r2, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}unsafe_frem_f32: -; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} -; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] -; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] -; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] -; GCN: buffer_store_dword [[RESULT]] -; GCN: s_endpgm -define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2) #1 { - %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 - %r0 = load float, float addrspace(1)* %in1, align 4 - %r1 = load 
float, float addrspace(1)* %gep2, align 4 - %r2 = frem float %r0, %r1 - store float %r2, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}frem_f64: -; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; GCN-DAG: v_div_fmas_f64 -; GCN-DAG: v_div_scale_f64 -; GCN-DAG: v_mul_f64 -; CI: v_trunc_f64_e32 -; CI: v_mul_f64 -; GCN: v_add_f64 -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm -define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) #0 { - %r0 = load double, double addrspace(1)* %in1, align 8 - %r1 = load double, double addrspace(1)* %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}unsafe_frem_f64: -; GCN: v_rcp_f64_e32 -; GCN: v_mul_f64 -; SI: v_bfe_u32 -; CI: v_trunc_f64_e32 -; GCN: v_fma_f64 -; GCN: s_endpgm -define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) #1 { - %r0 = load double, double addrspace(1)* %in1, align 8 - %r1 = load double, double addrspace(1)* %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, double addrspace(1)* %out, align 8 - ret void -} - -define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, - <2 x float> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 - %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 - %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 - %r2 = frem <2 x float> %r0, %r1 - store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, - <4 x float> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 - %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 - %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 - %r2 = frem <4 x float> %r0, %r1 - store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 - %r2 = frem <2 x double> %r0, %r1 - store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -attributes #0 = { nounwind "unsafe-fp-math"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll deleted file mode 100644 index 04101346cdf..00000000000 --- a/test/CodeGen/R600/fsqrt.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s - -; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) - -; CHECK: {{^}}fsqrt_f32: -; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} - -define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* 
%in) { - %r0 = load float, float addrspace(1)* %in - %r1 = call float @llvm.sqrt.f32(float %r0) - store float %r1, float addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fsqrt_f64: -; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r0 = load double, double addrspace(1)* %in - %r1 = call double @llvm.sqrt.f64(double %r0) - store double %r1, double addrspace(1)* %out - ret void -} - -declare float @llvm.sqrt.f32(float %Val) -declare double @llvm.sqrt.f64(double %Val) diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll deleted file mode 100644 index dfe41cb5b11..00000000000 --- a/test/CodeGen/R600/fsub.ll +++ /dev/null @@ -1,75 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}v_fsub_f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 - %a = load float, float addrspace(1)* %in, align 4 - %b = load float, float addrspace(1)* %b_ptr, align 4 - %result = fsub float %a, %b - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_fsub_f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W - -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { - %sub = fsub float %a, %b - store float %sub, float addrspace(1)* %out, align 4 - ret void -} - -declare float @llvm.R600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - -; FUNC-LABEL: {{^}}fsub_v2f32: -; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z -; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y - -; FIXME: Should be using SGPR directly for first operand -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { - %sub = fsub <2 x float> %a, %b - store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_fsub_v4f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} - -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 - %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 - %result = fsub <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: Should be 
using SGPR directly for first operand - -; FUNC-LABEL: {{^}}s_fsub_v4f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: s_endpgm -define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { - %result = fsub <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll deleted file mode 100644 index f34a48e30a8..00000000000 --- a/test/CodeGen/R600/fsub64.ll +++ /dev/null @@ -1,107 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.fabs.f64(double) #0 - -; SI-LABEL: {{^}}fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fsub double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_fabs_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r1.fabs = call double @llvm.fabs.f64(double %r1) #0 - %r2 = fsub double %r0, %r1.fabs - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_fabs_inv_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} -define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r0.fabs = call double @llvm.fabs.f64(double %r0) #0 - %r2 = fsub double %r0.fabs, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double %a, %b - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_imm_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double 4.0, %a - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_imm_inv_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double %a, 4.0 - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_self_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { - %sub = fsub double %a, %a - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_v2f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], 
-v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { - %sub = fsub <2 x double> %a, %b - store <2 x double> %sub, <2 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_v4f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 - %a = load <4 x double>, <4 x double> addrspace(1)* %in - %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr - %result = fsub <4 x double> %a, %b - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_v4f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { - %result = fsub <4 x double> %a, %b - store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll deleted file mode 100644 index 6618d8b5e57..00000000000 --- a/test/CodeGen/R600/ftrunc.f64.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.trunc.f64(double) nounwind readnone -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}v_ftrunc_f64: -; CI: v_trunc_f64 -; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 -; SI: s_endpgm -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %x = load double, double addrspace(1)* %in, align 8 - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_f64: -; CI: v_trunc_f64_e32 - -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: s_endpgm -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret 
void -} - -; FUNC-LABEL: {{^}}ftrunc_v2f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64: -; FIXME-CI: v_trunc_f64_e32 -; FIXME-CI: v_trunc_f64_e32 -; FIXME-CI: v_trunc_f64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ftrunc_v4f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v8f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v16f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll deleted file mode 100644 index edc08609a8a..00000000000 --- a/test/CodeGen/R600/ftrunc.ll +++ /dev/null @@ -1,120 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s - -declare float @llvm.trunc.f32(float) nounwind readnone -declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone -declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone -declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone -declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone - -; FUNC-LABEL: {{^}}ftrunc_f32: -; EG: TRUNC -; SI: v_trunc_f32_e32 -define void @ftrunc_f32(float addrspace(1)* %out, float %x) { - %y = call float @llvm.trunc.f32(float %x) nounwind readnone - store float %y, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v2f32: -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { - %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone - store <2 x float> %y, <2 x 
float> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32: -; FIXME-EG: TRUNC -; FIXME-EG: TRUNC -; FIXME-EG: TRUNC -; FIXME-SI: v_trunc_f32_e32 -; FIXME-SI: v_trunc_f32_e32 -; FIXME-SI: v_trunc_f32_e32 -; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { -; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone -; store <3 x float> %y, <3 x float> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ftrunc_v4f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { - %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone - store <4 x float> %y, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v8f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { - %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone - store <8 x float> %y, <8 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v16f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { - %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone - store <16 x float> %y, <16 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll deleted file mode 100644 index 471b0f6b13e..00000000000 --- a/test/CodeGen/R600/gep-address-space.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s - -define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { -; CHECK-LABEL: {{^}}use_gep_address_space: -; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} -; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 - %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16 - store i32 99, i32 addrspace(3)* %p - ret void -} - -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { -; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: -; The LDS offset will be 65536 bytes, which is larger than the size of LDS on -; SI, which is why it is being OR'd with the base pointer. 
-; SI: s_or_b32 -; CI: s_add_i32 -; CHECK: ds_write_b32 - %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 - store i32 99, i32 addrspace(3)* %p - ret void -} - -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { -; CHECK-LABEL: {{^}}gep_as_vector_v4: -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 - %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> - %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 - %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 - %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2 - %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3 - store i32 99, i32 addrspace(3)* %p0 - store i32 99, i32 addrspace(3)* %p1 - store i32 99, i32 addrspace(3)* %p2 - store i32 99, i32 addrspace(3)* %p3 - ret void -} - -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { -; CHECK-LABEL: {{^}}gep_as_vector_v2: -; CHECK: s_add_i32 -; CHECK: s_add_i32 - %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> - %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 - %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 - store i32 99, i32 addrspace(3)* %p0 - store i32 99, i32 addrspace(3)* %p1 - ret void -} - diff --git a/test/CodeGen/R600/global-directive.ll b/test/CodeGen/R600/global-directive.ll deleted file mode 100644 index be775cf9292..00000000000 --- a/test/CodeGen/R600/global-directive.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; Make sure the GlobalDirective isn't merged with the function name - -; SI: .globl foo -; SI: {{^}}foo: -define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global-extload-i1.ll b/test/CodeGen/R600/global-extload-i1.ll deleted file mode 100644 index bd9557d730f..00000000000 --- a/test/CodeGen/R600/global-extload-i1.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen broken - -; FUNC-LABEL: {{^}}zextload_global_i1_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = zext i1 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i1_to_i32: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = sext i1 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32: -; SI: 
s_endpgm -define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = zext <1 x i1> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32: -; SI: s_endpgm -define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = sext <1 x i1> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = zext <2 x i1> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = sext <2 x i1> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = zext <4 x i1> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = sext <4 x i1> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = zext <8 x i1> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = sext <8 x i1> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = zext <16 x i1> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = sext <16 x i1> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32: -; 
XSI: s_endpgm -; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = zext <32 x i1> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32: -; XSI: s_endpgm -; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = sext <32 x i1> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32: -; XSI: s_endpgm -; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = zext <64 x i1> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32: -; XSI: s_endpgm -; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = sext <64 x i1> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} -; SI: buffer_store_dwordx2 -define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = zext i1 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = sext i1 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = zext <1 x i1> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = sext <1 x i1> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = zext <2 x i1> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> 
addrspace(1)* %in - %ext = sext <2 x i1> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = zext <4 x i1> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = sext <4 x i1> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = zext <8 x i1> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = sext <8 x i1> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = zext <16 x i1> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = sext <16 x i1> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64: -; XSI: s_endpgm -; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = zext <32 x i1> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64: -; XSI: s_endpgm -; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = sext <32 x i1> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64: -; XSI: s_endpgm -; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = zext <64 x i1> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64: -; XSI: s_endpgm -; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x 
i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = sext <64 x i1> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } diff --git a/test/CodeGen/R600/global-extload-i16.ll b/test/CodeGen/R600/global-extload-i16.ll deleted file mode 100644 index 103a40dee27..00000000000 --- a/test/CodeGen/R600/global-extload-i16.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: cypress is broken because the bigger testcases spill and it's not implemented - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: -; SI: buffer_load_ushort -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: -; SI: buffer_load_sshort -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: -; SI: buffer_load_ushort -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: -; SI: buffer_load_sshort -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* 
nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: -; SI: buffer_load_ushort v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; 
FUNC-LABEL: {{^}}sextload_global_i16_to_i64: -; SI: buffer_load_sshort [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to 
<16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global-extload-i32.ll b/test/CodeGen/R600/global-extload-i32.ll deleted file mode 100644 index 79b83452939..00000000000 --- a/test/CodeGen/R600/global-extload-i32.ll +++ /dev/null @@ -1,457 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: -; SI: buffer_load_dword v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %a = load i32, i32 addrspace(1)* %in - %ext = zext i32 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i32_to_i64: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %a = load i32, i32 addrspace(1)* %in - %ext = sext i32 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64: -; SI: buffer_load_dword -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x 
i32>, <1 x i32> addrspace(1)* %in - %ext = zext <1 x i32> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64: -; SI: buffer_load_dword -; SI: v_ashrrev_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i32>, <1 x i32> addrspace(1)* %in - %ext = sext <1 x i32> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i32>, <2 x i32> addrspace(1)* %in - %ext = zext <2 x i32> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64: -; SI: buffer_load_dwordx2 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i32>, <2 x i32> addrspace(1)* %in - %ext = sext <2 x i32> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i32>, <4 x i32> addrspace(1)* %in - %ext = zext <4 x i32> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64: -; SI: buffer_load_dwordx4 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i32>, <4 x i32> addrspace(1)* %in - %ext = sext <4 x i32> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i32>, <8 x i32> addrspace(1)* %in - %ext = zext <8 x i32> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; 
SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i32>, <8 x i32> addrspace(1)* %in - %ext = sext <8 x i32> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i32>, <16 x i32> addrspace(1)* %in - %ext = sext <16 x i32> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 - -; SI: s_endpgm -define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i32>, <16 x i32> addrspace(1)* %in - %ext = zext <16 x i32> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out 
- ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i32>, <32 x i32> addrspace(1)* %in - %ext = sext <32 x i32> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i32>, <32 x i32> addrspace(1)* %in - %ext = zext <32 x i32> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global-extload-i8.ll b/test/CodeGen/R600/global-extload-i8.ll deleted file mode 100644 index b31d5361d5a..00000000000 --- a/test/CodeGen/R600/global-extload-i8.ll +++ /dev/null @@ -1,299 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}zextload_global_i8_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = zext i8 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i8_to_i32: -; SI: buffer_load_sbyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = sext i8 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32: -; SI: s_endpgm -define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = zext <1 x i8> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32: -; SI: s_endpgm -define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = sext <1 x i8> %load to <1 x i32> - store <1 
x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = zext <2 x i8> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = sext <2 x i8> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = zext <4 x i8> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = sext <4 x i8> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = zext <8 x i8> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = sext <8 x i8> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = zext <16 x i8> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = sext <16 x i8> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32: -; XSI: s_endpgm -; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = zext <32 x i8> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32: -; XSI: s_endpgm -; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = sext <32 
x i8> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32: -; XSI: s_endpgm -; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = zext <64 x i8> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32: -; XSI: s_endpgm -; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = sext <64 x i8> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: -; SI: buffer_load_ubyte v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = zext i8 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i8_to_i64: -; SI: buffer_load_sbyte [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = sext i8 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = zext <1 x i8> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = sext <1 x i8> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = zext <2 x i8> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = sext <2 x i8> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = zext <4 x i8> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> 
addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = sext <4 x i8> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = zext <8 x i8> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = sext <8 x i8> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = zext <16 x i8> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = sext <16 x i8> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64: -; XSI: s_endpgm -; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = zext <32 x i8> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64: -; XSI: s_endpgm -; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = sext <32 x i8> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64: -; XSI: s_endpgm -; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = zext <64 x i8> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64: -; XSI: s_endpgm -; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = sext <64 x i8> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } diff --git a/test/CodeGen/R600/global-zero-initializer.ll b/test/CodeGen/R600/global-zero-initializer.ll deleted file mode 100644 index 45aa8bf4e1d..00000000000 --- a/test/CodeGen/R600/global-zero-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer 
for address space in load_init_global_global - -@lds = addrspace(1) global [256 x i32] zeroinitializer - -define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(1)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global_atomics.ll b/test/CodeGen/R600/global_atomics.ll deleted file mode 100644 index 847950f6376..00000000000 --- a/test/CodeGen/R600/global_atomics.ll +++ /dev/null @@ -1,801 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_add_i32_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_addr64: -; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in 
seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: -; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_addr64: -; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: -; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: -; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* 
%out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_addr64: -; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, 
i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: -; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_offset: -; SI: 
buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: -; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32: -; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_addr64: -; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { 
-entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: -; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - 
ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_addr64: -; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: -; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] 
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: -; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, 
i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: -; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll deleted file mode 100644 index 014b0a5482a..00000000000 --- a/test/CodeGen/R600/gv-const-addrspace-fail.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1 - -; FUNC-LABEL: {{^}}test_i8: -; EG: CF_END -; SI: buffer_store_byte -; SI: s_endpgm -define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 { - %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s - %1 = load i8, i8 addrspace(2)* %arrayidx, align 1 - store i8 %1, 
i8 addrspace(1)* %out - ret void -} - -@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 - -; FUNC-LABEL: {{^}}test_i16: -; EG: CF_END -; SI: buffer_store_short -; SI: s_endpgm -define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 { - %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s - %1 = load i16, i16 addrspace(2)* %arrayidx, align 2 - store i16 %1, i16 addrspace(1)* %out - ret void -} - -%struct.bar = type { float, [5 x i8] } - -; The illegal i8s aren't handled -@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ] - -; FUNC-LABEL: {{^}}struct_bar_gv_load: -define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index - %load = load i8, i8 addrspace(2)* %gep, align 1 - store i8 %load, i8 addrspace(1)* %out, align 1 - ret void -} - - -; The private load isn't scalarzied. -@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> , - <4 x i32> , - <4 x i32> , - <4 x i32> ] - -; FUNC-LABEL: {{^}}array_vector_gv_load: -define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index - %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16 - store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll deleted file mode 100644 index 3c1fc6c98f7..00000000000 --- a/test/CodeGen/R600/gv-const-addrspace.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 - -@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 - -; FUNC-LABEL: {{^}}float: -; FIXME: We should be using s_load_dword here. -; SI: buffer_load_dword -; VI: s_load_dword - -; EG-DAG: MOV {{\** *}}T2.X -; EG-DAG: MOV {{\** *}}T3.X -; EG-DAG: MOV {{\** *}}T4.X -; EG-DAG: MOV {{\** *}}T5.X -; EG-DAG: MOV {{\** *}}T6.X -; EG: MOVA_INT - -define void @float(float addrspace(1)* %out, i32 %index) { -entry: - %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index - %1 = load float, float addrspace(2)* %0 - store float %1, float addrspace(1)* %out - ret void -} - -@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 - -; FUNC-LABEL: {{^}}i32: - -; FIXME: We should be using s_load_dword here. 
-; SI: buffer_load_dword -; VI: s_load_dword - -; EG-DAG: MOV {{\** *}}T2.X -; EG-DAG: MOV {{\** *}}T3.X -; EG-DAG: MOV {{\** *}}T4.X -; EG-DAG: MOV {{\** *}}T5.X -; EG-DAG: MOV {{\** *}}T6.X -; EG: MOVA_INT - -define void @i32(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - - -%struct.foo = type { float, [5 x i32] } - -@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] - -; FUNC-LABEL: {{^}}struct_foo_gv_load: -; GCN: s_load_dword - -define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index - %load = load i32, i32 addrspace(2)* %gep, align 4 - store i32 %load, i32 addrspace(1)* %out, align 4 - ret void -} - -@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> , - <1 x i32> , - <1 x i32> , - <1 x i32> ] - -; FUNC-LABEL: {{^}}array_v1_gv_load: -; FIXME: We should be using s_load_dword here. -; SI: buffer_load_dword -; VI: s_load_dword -define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index - %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 - store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 - ret void -} - -define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { -entry: - %0 = icmp eq i32 0, %a - br i1 %0, label %if, label %else - -if: - %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index - %2 = load float, float addrspace(2)* %1 - store float %2, float addrspace(1)* %out - br label %endif - -else: - store float 1.0, float addrspace(1)* %out - br label %endif - -endif: - ret void -} diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll deleted file mode 100644 index bf8f11860b5..00000000000 --- a/test/CodeGen/R600/half.ll +++ /dev/null @@ -1,525 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; half args should be promoted to float - -; GCN-LABEL: {{^}}load_f16_arg: -; GCN: s_load_dword [[ARG:s[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] -; GCN: buffer_store_short [[CVT]] -define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { - store half %arg, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v2f16_arg: -; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN: s_endpgm -define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { - store <2 x half> %arg, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v3f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_short -; 
GCN-NOT: buffer_store -; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { - store <3 x half> %arg, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { - store <4 x half> %arg, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { - store <8 x half> %arg, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_arg: -define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { - %fpext = fpext <2 x half> %in to <2 x float> - store <2 x float> %fpext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to float - store float %ext, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x float> - store <2 x float> %ext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN-NOT: v_cvt_f32_f16 -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_dwordx2 -; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x float> - store <3 x float> %ext, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x float> - store <4 x float> %ext, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x float> - store <8 x float> %ext, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f64_arg: -define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to double - store double %ext, double addrspace(1)* %out - ret void -} -; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x double> - store <2 x double> %ext, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x double> - store <3 x double> %ext, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x double> - store <4 x 
double> %ext, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x double> - store <8 x double> %ext, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_f16: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - store half %val, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v2f16: -; GCN: buffer_load_dword [[TMP:v[0-9]+]] -; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - store <2 x half> %val, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v4f16: -; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - store <4 x half> %val, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v8f16: -; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - store <8 x half> %val, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f32: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to float - store float %cvt, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 
{{^}}global_extload_v16f16_to_v16f32: -define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x float> - store <16 x float> %cvt, <16 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f64: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] -; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] -; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to double - store double %cvt, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x double> - store <2 x double> %cvt, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x double> - store <3 x double> %cvt, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x double> - store <4 x double> %cvt, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x double> - store <8 x double> %cvt, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x double> - store <16 x double> %cvt, <16 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { - %val = load float, float addrspace(1)* %in - %cvt = fptrunc float %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: buffer_store_short [[CVT0]] -; GCN-DAG: buffer_store_short [[CVT1]] -; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { - %val = load <2 x float>, <2 x float> addrspace(1)* %in - %cvt = fptrunc <2 x float> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void 
-} - -; FIXME: Shouldn't do 4th conversion -; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { - %val = load <3 x float>, <3 x float> addrspace(1)* %in - %cvt = fptrunc <3 x float> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { - %val = load <4 x float>, <4 x float> addrspace(1)* %in - %cvt = fptrunc <4 x float> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { - %val = load <8 x float>, <8 x float> addrspace(1)* %in - %cvt = fptrunc <8 x float> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void 
@global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { - %val = load <16 x float>, <16 x float> addrspace(1)* %in - %cvt = fptrunc <16 x float> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} - -; FIXME: Unsafe math should fold conversions away -; GCN-LABEL: {{^}}fadd_f16: -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { - %add = fadd half %a, %b - store half %add, half addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}fadd_v2f16: -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { - %add = fadd <2 x half> %a, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}fadd_v4f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 - %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 - %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x half> %a, %b - store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 - ret void -} - -; GCN-LABEL: {{^}}fadd_v8f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { - %add = fadd <8 x half> %a, %b - store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 - ret void -} - -; GCN-LABEL: {{^}}fsub_f16: -; GCN: v_subrev_f32_e32 -; GCN: s_endpgm -define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 - %a = load half, half addrspace(1)* %in - %b = load half, half addrspace(1)* %b_ptr - %sub = fsub half %a, %b - store half %sub, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_from_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { - %val = load half, half addrspace(1)* %in - %val_int = bitcast half %val to i16 - store i16 %val_int, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_to_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { - %val = load i16, i16 addrspace(1)* %in - %val_fp = bitcast i16 %val to half - store half %val_fp, half addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/hsa.ll b/test/CodeGen/R600/hsa.ll deleted file mode 100644 index f9113399afe..00000000000 --- a/test/CodeGen/R600/hsa.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s - -; HSA: .section .hsa.version -; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0" -; HSA: {{^}}simple: -; Make sure we are setting the ATC bit: -; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000 -; HSA: buffer_store_dword v{{[0-9]+}}, 
s[0:[[HI]]], 0 - -define void @simple(i32 addrspace(1)* %out) { -entry: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/i1-copy-implicit-def.ll b/test/CodeGen/R600/i1-copy-implicit-def.ll deleted file mode 100644 index b11a2113764..00000000000 --- a/test/CodeGen/R600/i1-copy-implicit-def.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SILowerI1Copies was not handling IMPLICIT_DEF -; SI-LABEL: {{^}}br_implicit_def: -; SI: BB#0: -; SI-NEXT: s_and_saveexec_b64 -; SI-NEXT: s_xor_b64 -; SI-NEXT: BB#1: -define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { -bb: - br i1 undef, label %bb1, label %bb2 - -bb1: - store volatile i32 123, i32 addrspace(1)* %out - ret void - -bb2: - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/i1-copy-phi.ll b/test/CodeGen/R600/i1-copy-phi.ll deleted file mode 100644 index 105cd06b330..00000000000 --- a/test/CodeGen/R600/i1-copy-phi.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}br_i1_phi: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; SI: s_and_saveexec_b64 -; SI: s_xor_b64 -; SI: v_mov_b32_e32 [[REG]], -1{{$}} -; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]] -; SI: s_and_saveexec_b64 -; SI: s_xor_b64 -; SI: s_endpgm -define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { -bb: - br i1 %arg1, label %bb2, label %bb3 - -bb2: ; preds = %bb - br label %bb3 - -bb3: ; preds = %bb2, %bb - %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] - br i1 %tmp, label %bb4, label %bb6 - -bb4: ; preds = %bb3 - %tmp5 = mul i32 undef, %arg - br label %bb6 - -bb6: ; preds = %bb4, %bb3 - ret void -} diff --git a/test/CodeGen/R600/i8-to-double-to-float.ll b/test/CodeGen/R600/i8-to-double-to-float.ll deleted file mode 100644 index c218e1918bb..00000000000 --- a/test/CodeGen/R600/i8-to-double-to-float.ll +++ /dev/null @@ -1,11 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { - %1 = load i8, i8 addrspace(1)* %in - %2 = uitofp i8 %1 to double - %3 = fptrunc double %2 to float - store float %3, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll deleted file mode 100644 index 60e59a5a528..00000000000 --- a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll +++ /dev/null @@ -1,18 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;Test that a select with reversed True/False values is correctly lowered -;to a SETNE_INT. There should only be one SETNE_INT instruction. 
- -;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK-NOT: SETNE_INT - -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx1 - %cmp = icmp eq i32 %0, %1 - %value = select i1 %cmp, i32 0, i32 -1 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/icmp64.ll b/test/CodeGen/R600/icmp64.ll deleted file mode 100644 index 0eaa33ebafe..00000000000 --- a/test/CodeGen/R600/icmp64.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}test_i64_eq: -; SI: v_cmp_eq_i64 -define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp eq i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ne: -; SI: v_cmp_ne_i64 -define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ne i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_slt: -; SI: v_cmp_lt_i64 -define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp slt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ult: -; SI: v_cmp_lt_u64 -define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ult i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sle: -; SI: v_cmp_le_i64 -define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sle i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ule: -; SI: v_cmp_le_u64 -define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ule i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sgt: -; SI: v_cmp_gt_i64 -define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sgt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ugt: -; SI: v_cmp_gt_u64 -define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ugt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sge: -; SI: v_cmp_ge_i64 -define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sge i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_uge: -; SI: v_cmp_ge_u64 -define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp uge i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll deleted file mode 100644 index 
12eed550eb1..00000000000 --- a/test/CodeGen/R600/imm.ll +++ /dev/null @@ -1,617 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s - -; Use a 64-bit value with lo bits that can be represented as an inline constant -; CHECK-LABEL: {{^}}i64_imm_inline_lo: -; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: -define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { -entry: - store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 - ret void -} - -; Use a 64-bit value with hi bits that can be represented as an inline constant -; CHECK-LABEL: {{^}}i64_imm_inline_hi: -; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] -; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] -define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { -entry: - store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 - ret void -} - -; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { - store i64 -9223372036854775808, i64 addrspace(1) *%out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { - store i32 -2147483648, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { - store float 0.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; CHECK: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { - store float -0.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { - store float 0.5, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { - store float -0.5, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { - store float 1.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { - store float -1.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32: -; CHECK: 
v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { - store float 2.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { - store float -2.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { - store float 4.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { - store float -4.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_literal_imm_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 -; CHECK: buffer_store_dword [[REG]] -define void @store_literal_imm_f32(float addrspace(1)* %out) { - store float 4096.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 1.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -1.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 2.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2.0_f32(float 
addrspace(1)* %out, float %x) { - %y = fadd float %x, -2.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 4.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -4.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] -; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %x = load float, float addrspace(1)* %in - %y = fadd float %x, 0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}commute_add_literal_f32: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] -; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %x = load float, float addrspace(1)* %in - %y = fadd float %x, 1024.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36a0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36b0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_16_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36e0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xffffffffe0000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xffffffffc0000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]] -; CHECK: buffer_store_dword 
[[REG]] -define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xfffffffe00000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_63_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36ff800000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_64_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x3700000000000000 - store float %y, float addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0.5 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -0.5 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 1.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -1.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void 
@add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 2.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -2.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 4.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -4.0 - store double %y, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}add_inline_imm_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000001 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000002 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000010 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xffffffffffffffff - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64: -; SI: 
s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xfffffffffffffffe - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xfffffffffffffff0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_63_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x000000000000003F - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_64_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000040 - store double %y, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64: -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { - store double 0.0, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { - store double -0.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { - store double 0.5, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { - store double -0.5, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: 
v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { - store double 1.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { - store double -1.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { - store double 2.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { - store double -2.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { - store double 4.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { - store double -4.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_literal_imm_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_f64(double addrspace(1)* %out) { - store double 4096.0, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll deleted file mode 100644 index f551606d63a..00000000000 --- a/test/CodeGen/R600/indirect-addressing-si.ll +++ /dev/null @@ -1,121 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; Tests for indirect addressing on SI, which is implemented using dynamic -; indexing of vectors. 
- -; CHECK-LABEL: {{^}}extract_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 -define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 %in, 1 - %1 = extractelement <4 x float> , i32 %0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 -define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = extractelement <4 x float> , i32 %in - store float %0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0 -define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { -entry: - %index = add i32 %offset, -512 - %value = extractelement <4 x i32> , i32 %index - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0 -; CHECK: s_cbranch_execnz -define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -512 - %value = extractelement <4 x i32> , i32 %index - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 -define void @insert_w_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 %in, 1 - %1 = insertelement <4 x float> , float 5.0, i32 %0 - %2 = extractelement <4 x float> %1, i32 2 - store float %2, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 -define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = insertelement <4 x float> , float 5.0, i32 %in - %1 = extractelement <4 x float> %0, i32 2 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} -define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { -entry: - %index = add i32 %offset, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz -define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
-; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, -{{[0-9]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz -define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -16 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll deleted file mode 100644 index d63e1b6c521..00000000000 --- a/test/CodeGen/R600/indirect-private-64.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s - - -declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind - -; SI-LABEL: {{^}}private_access_f64_alloca: - -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 - -; SI-PROMOTE: ds_write_b64 -; SI-PROMOTE: ds_read_b64 -define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load double, double addrspace(1)* %in, align 8 - %array = alloca double, i32 16, align 8 - %ptr = getelementptr double, double* %array, i32 %b - store double %val, double* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load double, double* %ptr, align 8 - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}private_access_v2f64_alloca: - -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 - %array = alloca <2 x double>, i32 16, align 16 - %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b - store <2 x double> %val, <2 x double>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load <2 x double>, <2 x double>* %ptr, align 16 - store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}private_access_i64_alloca: - -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 - -; SI-PROMOTE: ds_write_b64 -; SI-PROMOTE: ds_read_b64 -define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %array = alloca i64, i32 16, align 8 - %ptr = getelementptr i64, i64* %array, i32 %b - store i64 %val, i64* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - 
%result = load i64, i64* %ptr, align 8 - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}private_access_v2i64_alloca: - -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %array = alloca <2 x i64>, i32 16, align 16 - %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b - store <2 x i64> %val, <2 x i64>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load <2 x i64>, <2 x i64>* %ptr, align 16 - store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/infinite-loop-evergreen.ll b/test/CodeGen/R600/infinite-loop-evergreen.ll deleted file mode 100644 index f6e39b3d830..00000000000 --- a/test/CodeGen/R600/infinite-loop-evergreen.ll +++ /dev/null @@ -1,10 +0,0 @@ -; XFAIL: * -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s - -define void @inf_loop_irreducible_cfg() nounwind { -entry: - br label %block - -block: - br label %block -} diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll deleted file mode 100644 index 7233aa57fd7..00000000000 --- a/test/CodeGen/R600/infinite-loop.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}infinite_loop: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: BB0_1: -; SI: buffer_store_dword [[REG]] -; SI: s_waitcnt vmcnt(0) expcnt(0) -; SI: s_branch BB0_1 -define void @infinite_loop(i32 addrspace(1)* %out) { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - store i32 999, i32 addrspace(1)* %out, align 4 - br label %for.body -} - diff --git a/test/CodeGen/R600/inline-asm.ll b/test/CodeGen/R600/inline-asm.ll deleted file mode 100644 index efc2292de3a..00000000000 --- a/test/CodeGen/R600/inline-asm.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}inline_asm: -; CHECK: s_endpgm -; CHECK: s_endpgm -define void @inline_asm(i32 addrspace(1)* %out) { -entry: - store i32 5, i32 addrspace(1)* %out - call void asm sideeffect "s_endpgm", ""() - ret void -} diff --git a/test/CodeGen/R600/inline-calls.ll b/test/CodeGen/R600/inline-calls.ll deleted file mode 100644 index 33a4c832e75..00000000000 --- a/test/CodeGen/R600/inline-calls.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s - -; CHECK-NOT: {{^}}func: -define internal fastcc i32 @func(i32 %a) { -entry: - %tmp0 = add i32 %a, 1 - ret i32 %tmp0 -} - -; CHECK: {{^}}kernel: -define void @kernel(i32 addrspace(1)* %out) { -entry: - %tmp0 = call i32 @func(i32 1) - store i32 %tmp0, i32 addrspace(1)* %out - ret void -} - -; CHECK: 
{{^}}kernel2: -define void @kernel2(i32 addrspace(1)* %out) { -entry: - call void @kernel(i32 addrspace(1)* %out) - ret void -} diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll deleted file mode 100644 index 1c4d285cbcb..00000000000 --- a/test/CodeGen/R600/input-mods.ll +++ /dev/null @@ -1,26 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM - -;EG-LABEL: {{^}}test: -;EG: EXP_IEEE * -;CM-LABEL: {{^}}test: -;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| -;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| -;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| -;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = call float @llvm.fabs.f32(float %r0) - %r2 = fsub float -0.000000e+00, %r1 - %r3 = call float @llvm.exp2.f32(float %r2) - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.exp2.f32(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/insert_subreg.ll b/test/CodeGen/R600/insert_subreg.ll deleted file mode 100644 index 4a5e8869c2d..00000000000 --- a/test/CodeGen/R600/insert_subreg.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s - -; Test that INSERT_SUBREG instructions don't have non-register operands after -; instruction selection. - -; Make sure this doesn't crash -; CHECK-LABEL: test: -define void @test(i64 addrspace(1)* %out) { -entry: - %tmp0 = alloca [16 x i32] - %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 - %tmp2 = sext i32 %tmp1 to i64 - store i64 %tmp2, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll deleted file mode 100644 index 6de3d408c48..00000000000 --- a/test/CodeGen/R600/insert_vector_elt.ll +++ /dev/null @@ -1,252 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; FIXME: Broken on evergreen -; FIXME: For some reason the 8 and 16 vectors are being stored as -; individual elements instead of 128-bit stores. - - -; FIXME: Why is the constant moved into the intermediate register and -; not just directly into the vector component? 
- -; SI-LABEL: {{^}}insertelement_v4f32_0: -; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]: -; v_mov_b32_e32 -; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00 -; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] -; buffer_store_dwordx4 v{{[}}[[LOW_REG]]: -define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_1: -define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_2: -define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_3: -define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4i32_0: -define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { - %vecins = insertelement <4 x i32> %a, i32 999, i32 0 - store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2f32: -; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { - %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b - store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4f32: -; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { - %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b - store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { - %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b - store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2i32: -; SI: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i32> %a, i32 5, i32 %b - store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i32: -; SI: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i32> %a, i32 5, i32 %b - store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8i32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <8 x i32> %a, i32 5, i32 %b - store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16i32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <16 x i32> %a, i32 5, i32 %b - store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 - ret void -} - - -; SI-LABEL: {{^}}dynamic_insertelement_v2i16: -; FIXMESI: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i16> %a, i16 5, i32 %b - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i16: -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i16> %a, i16 5, i32 %b - store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16 - ret void -} - - -; SI-LABEL: {{^}}dynamic_insertelement_v2i8: -; FIXMESI: BUFFER_STORE_USHORT -define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i8> %a, i8 5, i32 %b - store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i8: -; FIXMESI: buffer_store_dword -define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i8> %a, i8 5, i32 %b - store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8i8: -; FIXMESI: buffer_store_dwordx2 -define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <8 x i8> %a, i8 5, i32 %b - store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16i8: -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <16 x i8> %a, i8 5, i32 %b - store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 - ret void -} - -; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that -; the compiler doesn't crash. 
-; SI-LABEL: {{^}}insert_split_bb: -define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { -entry: - %0 = insertelement <2 x i32> undef, i32 %a, i32 0 - %1 = icmp eq i32 %a, 0 - br i1 %1, label %if, label %else - -if: - %2 = load i32, i32 addrspace(1)* %in - %3 = insertelement <2 x i32> %0, i32 %2, i32 1 - br label %endif - -else: - %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %5 = load i32, i32 addrspace(1)* %4 - %6 = insertelement <2 x i32> %0, i32 %5, i32 1 - br label %endif - -endif: - %7 = phi <2 x i32> [%3, %if], [%6, %else] - store <2 x i32> %7, <2 x i32> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { - %vecins = insertelement <2 x double> %a, double 8.0, i32 %b - store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2i64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i64> %a, i64 5, i32 %b - store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { - %vecins = insertelement <4 x double> %a, double 8.0, i32 %b - store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { - %vecins = insertelement <8 x double> %a, double 8.0, i32 %b - store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll deleted file mode 100644 index f55912e3740..00000000000 --- a/test/CodeGen/R600/jump-address.ll +++ /dev/null @@ -1,52 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: JUMP @6 -; CHECK: EXPORT -; CHECK-NOT: EXPORT - -define void @main() #0 { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - %4 = sext i1 %3 to i32 - %5 = bitcast i32 %4 to float - %6 = bitcast float %5 to i32 - %7 = icmp ne i32 %6, 0 - br i1 %7, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %9 = extractelement <4 x float> %8, i32 0 - %10 = bitcast float %9 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float 
%13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF13, label %ENDIF - -ENDIF: ; preds = %IF13, %ELSE, %main_body - %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %16 = insertelement <4 x float> undef, float %temp.0, i32 0 - %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 - %18 = insertelement <4 x float> %17, float %temp2.0, i32 2 - %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) - ret void - -IF13: ; preds = %ELSE - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fsub float -0.000000e+00, %21 - %23 = fadd float 0xFFF8000000000000, %22 - br label %ENDIF -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll deleted file mode 100644 index 7e2291cfdc3..00000000000 --- a/test/CodeGen/R600/kcache-fold.ll +++ /dev/null @@ -1,100 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main1: -; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} -define void @main1() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = extractelement <4 x float> %10, i32 1 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 2 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, 
<4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) - ret void -} - -; CHECK: {{^}}main2: -; CHECK-NOT: MOV -define void @main2() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 1 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %11 = extractelement <4 x float> %10, i32 0 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 3 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 2 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float @llvm.AMDIL.clamp.(float 
%31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDIL.clamp.(float, float, float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll deleted file mode 100644 index 1dd7c2cb799..00000000000 --- a/test/CodeGen/R600/kernel-args.ll +++ /dev/null @@ -1,473 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC - -; FUNC-LABEL: {{^}}i8_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ubyte - -define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { -entry: - %0 = zext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i8_zext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { -entry: - %0 = zext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i8_sext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { -entry: - %0 = sext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ushort - -define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { -entry: - %0 = zext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_zext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { -entry: - %0 = zext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_sext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { -entry: - %0 = sext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i32_arg: -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { -entry: - store i32 %in, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}f32_arg: -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword 
s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { -entry: - store float %in, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v2i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { -entry: - store <2 x i8> %in, <2 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v2i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN-DAG: buffer_load_ushort -; GCN-DAG: buffer_load_ushort -define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { -entry: - store <2 x i16> %in, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v2i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W -; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb -; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { -entry: - store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v2f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W -; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb -; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { -entry: - store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3i8_arg: -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { -entry: - store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3i16_arg: -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { -entry: - store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 - ret void -} -; FUNC-LABEL: {{^}}v3i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { -entry: - store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { -entry: - store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v4i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { -entry: - store <4 x i8> %in, <4 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v4i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 
-; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { -entry: - store <4 x i16> %in, <4 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v4i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { -entry: - store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v4f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { -entry: - store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v8i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { -entry: - store <8 x i8> %in, <8 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v8i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { -entry: - store <8 x i16> %in, <8 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v8i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 -define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { -entry: - store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v8f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { -entry: - store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}v16i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { -entry: - store <16 x i8> %in, <16 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v16i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { -entry: - store <16 x i16> %in, <16 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v16i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { -entry: - store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v16f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 
-define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { -entry: - store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}kernel_arg_i64: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: buffer_store_dwordx2 -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { - store i64 %a, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}f64_kernel_arg: -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb -; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 -; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c -; GCN: buffer_store_dwordx2 -define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { -entry: - store double %in, double addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}kernel_arg_v1i64: -; XGCN: s_load_dwordx2 -; XGCN: s_load_dwordx2 -; XGCN: buffer_store_dwordx2 -; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { -; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 -; ret void -; } diff --git a/test/CodeGen/R600/large-alloca.ll b/test/CodeGen/R600/large-alloca.ll deleted file mode 100644 index 671833d1a33..00000000000 --- a/test/CodeGen/R600/large-alloca.ll +++ /dev/null @@ -1,15 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s - -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %0 = load i32, i32* %gep1 - store i32 %0, i32 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll deleted file mode 100644 index 9975b1b7f5c..00000000000 --- a/test/CodeGen/R600/large-constant-initializer.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s -; CHECK: s_endpgm - -@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 - -define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 - %mul12 = mul nsw i32 %val, 7 - br i1 undef, label %exit, label %bb - -bb: - %cmp = icmp slt i32 %x, 0 - br label %exit - -exit: - ret void -} - diff --git a/test/CodeGen/R600/lds-initializer.ll b/test/CodeGen/R600/lds-initializer.ll deleted file mode 100644 index bf8df63be9f..00000000000 --- a/test/CodeGen/R600/lds-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_init_lds_global - -@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] - -define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(3)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/lds-oqap-crash.ll b/test/CodeGen/R600/lds-oqap-crash.ll deleted file mode 100644 index 6ff6fc3d7af..00000000000 
--- a/test/CodeGen/R600/lds-oqap-crash.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
-
-; The test is for a bug in R600EmitClauseMarkers.cpp where this pass
-; searched for a use of the OQAP register in order to determine
-; if an LDS instruction could fit in the current clause, but never found
-; one. This created an infinite loop and hung the compiler.
-;
-; The LDS instruction should not have been defining OQAP in the first place,
-; because the LDS instructions are pseudo instructions and the OQAP
-; reads and writes are bundled together in the same instruction.
-
-; CHECK: {{^}}lds_crash:
-define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
-entry:
-  %0 = load i32, i32 addrspace(3)* %in
-  ; This block needs to be > 115 ISA instructions to hit the bug,
-  ; so we'll use udiv instructions.
-  %div0 = udiv i32 %0, %b
-  %div1 = udiv i32 %div0, %a
-  %div2 = udiv i32 %div1, 11
-  %div3 = udiv i32 %div2, %a
-  %div4 = udiv i32 %div3, %b
-  %div5 = udiv i32 %div4, %c
-  %div6 = udiv i32 %div5, %div0
-  %div7 = udiv i32 %div6, %div1
-  store i32 %div7, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
deleted file mode 100644
index 44ffc36af14..00000000000
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ /dev/null
@@ -1,99 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
-;
-; This test checks that the lds input queue will be empty at the end of
-; the ALU clause.

-; CHECK-LABEL: {{^}}lds_input_queue:
-; CHECK: LDS_READ_RET * OQAP
-; CHECK-NOT: ALU clause
-; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-
-@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
-
-define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
-entry:
-  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-  %1 = load i32, i32 addrspace(3)* %0
-  call void @llvm.AMDGPU.barrier.local()
-
-  ; This will start a new clause for the vertex fetch
-  %2 = load i32, i32 addrspace(1)* %in
-  %3 = add i32 %1, %2
-  store i32 %3, i32 addrspace(1)* %out
-  ret void
-}
-
-declare void @llvm.AMDGPU.barrier.local()
-
-; The machine scheduler does not do proper alias analysis and assumes that
-; loads from global values (Note that a global value is different from a
-; value from global memory. A global value is a value that is declared
-; outside of a function; it can reside in any address space) alias with
-; all other loads.
-;
-; This is a problem for scheduling the reads from the local data share (lds).
-; These reads are implemented using two instructions. The first copies the
-; data from lds into the lds output queue, and the second moves the data from
-; the input queue into main memory. These two instructions don't have to be
-; scheduled one after the other, but they do need to be scheduled in the same
-; clause. The aliasing problem mentioned above causes problems when there is a
-; load from global memory which immediately follows a load from a global value that
-; has been declared in the local memory space:
-;
-; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-; %1 = load i32, i32 addrspace(3)* %0
-; %2 = load i32, i32 addrspace(1)* %in
-;
-; The instruction selection phase will generate ISA that looks like this:
-; %OQAP = LDS_READ_RET
-; %vreg0 = MOV %OQAP
-; %vreg1 = VTX_READ_32
-; %vreg2 = ADD_INT %vreg1, %vreg0
-;
-; The bottom scheduler will schedule the two ALU instructions first:
-;
-; UNSCHEDULED:
-; %OQAP = LDS_READ_RET
-; %vreg1 = VTX_READ_32
-;
-; SCHEDULED:
-;
-; vreg0 = MOV %OQAP
-; vreg2 = ADD_INT %vreg1, %vreg2
-;
-; The lack of proper aliasing causes the local memory read (LDS_READ_RET)
-; to treat the global memory read (VTX_READ_32) as a chain dependency, so
-; the global memory read will always be scheduled first. This will give us a
-; final program which looks like this:
-;
-; Alu clause:
-; %OQAP = LDS_READ_RET
-; VTX clause:
-; %vreg1 = VTX_READ_32
-; Alu clause:
-; vreg0 = MOV %OQAP
-; vreg2 = ADD_INT %vreg1, %vreg2
-;
-; This is an illegal program because the OQAP def and use now occur in
-; different ALU clauses.
-;
-; This test checks this scenario and makes sure it doesn't result in an
-; illegal program. For now, we have fixed this issue by merging the
-; LDS_READ_RET and MOV together during instruction selection and then
-; expanding them after scheduling. Once the scheduler has better alias
-; analysis, we should be able to keep these instructions separate before
-; scheduling.
-;
-; CHECK-LABEL: {{^}}local_global_alias:
-; CHECK: LDS_READ_RET
-; CHECK-NOT: ALU clause
-; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
-  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
-  %1 = load i32, i32 addrspace(3)* %0
-  %2 = load i32, i32 addrspace(1)* %in
-  %3 = add i32 %2, %1
-  store i32 %3, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
deleted file mode 100644
index 3e8328659fd..00000000000
--- a/test/CodeGen/R600/lds-size.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; This test makes sure we do not double count global values when they are
-; used in different basic blocks.
- -; CHECK: .long 166120 -; CHECK-NEXT: .long 1 -; CHECK-LABEL: {{^}}test: -@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 - -define void @test(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %if, label %else - -if: - store i32 1, i32 addrspace(3)* @lds - br label %endif - -else: - store i32 2, i32 addrspace(3)* @lds - br label %endif - -endif: - ret void -} diff --git a/test/CodeGen/R600/lds-zero-initializer.ll b/test/CodeGen/R600/lds-zero-initializer.ll deleted file mode 100644 index fb51bc0e50c..00000000000 --- a/test/CodeGen/R600/lds-zero-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global - -@lds = addrspace(3) global [256 x i32] zeroinitializer - -define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(3)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll deleted file mode 100644 index 4244c48d240..00000000000 --- a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This tests a bug where LegalizeDAG was not checking the target's -; BooleanContents value and always using one for true, when expanding -; setcc to select_cc. -; -; This bug caused the icmp IR instruction to be expanded to two machine -; instructions, when only one is needed. -; - -; CHECK: {{^}}setcc_expand: -; CHECK: SET -; CHECK-NOT: CND -define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp eq i32 %in, 5 - br i1 %0, label %IF, label %ENDIF -IF: - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %1 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg deleted file mode 100644 index ad9ce2541ef..00000000000 --- a/test/CodeGen/R600/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'R600' in config.root.targets: - config.unsupported = True diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll deleted file mode 100644 index cff1c24f89d..00000000000 --- a/test/CodeGen/R600/literals.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Test using an integer literal constant. -; Generated ASM should be: -; ADD_INT KC0[2].Z literal.x, 5 -; or -; ADD_INT literal.x KC0[2].Z, 5 - -; CHECK: {{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5 -define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 5, %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Test using a float literal constant. 
-; Generated ASM should be: -; ADD KC0[2].Z literal.x, 5.0 -; or -; ADD literal.x KC0[2].Z, 5.0 - -; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.0 -define void @float_literal(float addrspace(1)* %out, float %in) { -entry: - %0 = fadd float 5.0, %in - store float %0, float addrspace(1)* %out - ret void -} - -; Make sure inline literals are folded into REG_SEQUENCE instructions. -; CHECK: {{^}}inline_literal_reg_sequence: -; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 - -define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { -entry: - store <4 x i32> , <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK: {{^}}inline_literal_dot4: -; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0 -; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 -; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 -; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 -define void @inline_literal_dot4(float addrspace(1)* %out) { -entry: - %0 = call float @llvm.AMDGPU.dp4(<4 x float> , <4 x float> ) - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll deleted file mode 100644 index 8bf094b8bc7..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.abs.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone - -; Legacy name -declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_abs_i32: -; SI: s_sub_i32 -; SI: s_max_i32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { - %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_abs_i32: -; SI: v_sub_i32_e32 -; SI: v_max_i32_e32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { - %val = load i32, i32 addrspace(1)* %src, align 4 - %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}abs_i32_legacy_amdil: -; SI: v_sub_i32_e32 -; SI: v_max_i32_e32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { - %val = load i32, i32 addrspace(1)* %src, align 4 - %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll deleted file mode 100644 index db883972d64..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG 
-check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_global: -; EG: GROUP_BARRIER -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_global(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.global() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.global() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll deleted file mode 100644 index 48fb2e0b1a8..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_local: -; EG: GROUP_BARRIER - -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_local(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.local() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.local() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll deleted file mode 100644 index 1168713ca66..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll +++ /dev/null @@ -1,437 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, 
i32 %src0, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_bfe_print_arg: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { - %load = load i32, i32 addrspace(1)* %src0, align 4 - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_7: -; SI-NOT: shl -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_8: -; SI: buffer_load_dword -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void 
@bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_13: -; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}bfe_i32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_12(i32 
addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 -; SI: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) - %shl = shl i32 %bfe, 8 - %ashr = ashr i32 %shl, 8 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @simplify_demanded_bfe_sdiv -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 -; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] -; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] -; SI: buffer_store_dword 
[[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone - %div = sdiv i32 %bfe, 2 - store i32 %div, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll deleted file mode 100644 index 541119242a9..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll +++ /dev/null @@ -1,627 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: -; SI: buffer_load_ubyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %ext = zext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: 
{{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_1: -; SI: buffer_load_dword -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI: s_endpgm -; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_3(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_4: -; SI-NOT: lshl -; SI-NOT: shr -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = lshr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_5: -; SI: buffer_load_dword -; SI-NOT: lshl -; SI-NOT: shr -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_7: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_11(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_13: -; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}bfe_u32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* 
%out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure that SimplifyDemandedBits doesn't cause the and to be -; reduced to the bits demanded by the bfe. - -; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
-; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: -; SI: buffer_load_dword [[ARG:v[0-9]+]] -; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] -; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 -; SI-DAG: buffer_store_dword [[AND]] -; SI-DAG: buffer_store_dword [[BFE]] -; SI: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, - i32 addrspace(1)* %out1, - i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %and = and i32 %src, 63 - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 - store i32 %and, i32 addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lshr_and: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = lshr i32 %a, 6 - %c = and i32 %b, 7 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_lshr_and: -; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 -; SI: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = lshr i32 %a, %b - %d = and i32 %c, 7 - store i32 %d, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 448 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr2: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 511 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}shl_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 -; SI: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = shl i32 %a, 9 - %c = lshr i32 %b, 11 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll deleted file mode 100644 index 517a55abc09..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfi_arg_arg_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_arg_arg_imm: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_arg_imm_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfi = call i32 
@llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_imm_arg_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll deleted file mode 100644 index 50492289d74..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfm_arg_arg: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; EG: BFM_INT -define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_arg_imm: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b -; EG: BFM_INT -define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_imm_arg: -; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}} -; EG: BFM_INT -define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_imm_imm: -; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8 -; EG: BFM_INT -define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_pattern: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) { - %a = shl i32 1, %x - %b = sub i32 %a, 1 - %c = shl i32 %b, %y - store i32 %c, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}bfm_pattern_simple: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 -define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) { - %a = shl i32 1, %x - %b = sub i32 %a, 1 - store i32 %b, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll deleted file mode 100644 index 301de4b1c82..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.brev.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_brev_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm 
-define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll deleted file mode 100644 index 11ec963ab31..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) nounwind readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone -declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}clamp_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm - -; EG: MOV_SAT -define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fneg = fsub float -0.0, %src - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %src.fneg.fabs = fsub float -0.0, %src.fabs - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}clamp_0_1_amdil_legacy_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.class.ll b/test/CodeGen/R600/llvm.AMDGPU.class.ll deleted file mode 100644 index 805a88b59c7..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.class.ll +++ /dev/null @@ -1,497 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float %a) #1 - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fneg = fsub float -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float 
%a) #1 - %a.fneg.fabs = fsub float -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 10 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_9bit_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f32: -; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void 
@test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; FIXME: Why isn't this using a literal constant operand? -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fneg = fsub double -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], 
s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %a.fneg.fabs = fsub double -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} -; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} -; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 9 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f64: -; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load double, double addrspace(1)* %in - - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: -; XSI: v_cmp_class_f64_e32 vcc, 1.0, -; SI: v_cmp_class_f64_e32 vcc, -; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: 
{{^}}test_class_lit_constant_dynamic_mask_f64: -; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or3_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - - %sext = sext i1 %or.1 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1 - %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 - %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 - %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 - %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - %or.2 = or i1 %or.1, %class3 - %or.3 = or i1 %or.2, 
%class4 - %or.4 = or i1 %or.3, %class5 - %or.5 = or i1 %or.4, %class6 - %or.6 = or i1 %or.5, %class7 - %or.7 = or i1 %or.6, %class8 - %or.8 = or i1 %or.7, %class9 - %sext = sext i1 %or.8 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_1: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_2: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} -; SI: s_or_b64 -; SI: s_endpgm -define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f32: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f64: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/test/CodeGen/R600/llvm.AMDGPU.cube.ll deleted file mode 
100644 index e95a51093cb..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.cube.ll +++ /dev/null @@ -1,59 +0,0 @@ - -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}cube: -; CHECK: CUBE T{{[0-9]}}.X -; CHECK: CUBE T{{[0-9]}}.Y -; CHECK: CUBE T{{[0-9]}}.Z -; CHECK: CUBE * T{{[0-9]}}.W -define void @cube() #0 { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %1 = extractelement <4 x float> %0, i32 3 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %3 = extractelement <4 x float> %2, i32 0 - %4 = fdiv float %3, %1 - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %6 = extractelement <4 x float> %5, i32 1 - %7 = fdiv float %6, %1 - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %9 = extractelement <4 x float> %8, i32 2 - %10 = fdiv float %9, %1 - %11 = insertelement <4 x float> undef, float %4, i32 0 - %12 = insertelement <4 x float> %11, float %7, i32 1 - %13 = insertelement <4 x float> %12, float %10, i32 2 - %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3 - %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14) - %16 = extractelement <4 x float> %15, i32 0 - %17 = extractelement <4 x float> %15, i32 1 - %18 = extractelement <4 x float> %15, i32 2 - %19 = extractelement <4 x float> %15, i32 3 - %20 = call float @fabs(float %18) - %21 = fdiv float 1.000000e+00, %20 - %22 = fmul float %16, %21 - %23 = fadd float %22, 1.500000e+00 - %24 = fmul float %17, %21 - %25 = fadd float %24, 1.500000e+00 - %26 = insertelement <4 x float> undef, float %25, i32 0 - %27 = insertelement <4 x float> %26, float %23, i32 1 - %28 = insertelement <4 x float> %27, float %19, i32 2 - %29 = insertelement <4 x float> %28, float %25, i32 3 - %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4) - call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } - diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll deleted file mode 100644 index 8b32f696449..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone - -; SI-LABEL: {{^}}test_unpack_byte0_to_float: -; SI: v_cvt_f32_ubyte0 -define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float 
@llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte1_to_float: -; SI: v_cvt_f32_ubyte1 -define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte2_to_float: -; SI: v_cvt_f32_ubyte2 -define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte3_to_float: -; SI: v_cvt_f32_ubyte3 -define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll deleted file mode 100644 index 55ca9c7536e..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone -declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fixup_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fixup_f64: -; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { - %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll deleted file mode 100644 index bcb7f870f1f..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll +++ /dev/null @@ -1,179 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 
| FileCheck -check-prefix=GCN -check-prefix=VI %s - -; FIXME: Enable for VI. - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate -declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone -declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f64: -; GCN: v_div_fmas_f64 -define void 
@test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { - %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { - %cmp = icmp eq i32 %i, 0 - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; SI: s_mov_b64 vcc, 0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; SI: s_mov_b64 vcc, -1 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} -; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} -; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] -; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] -; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - %cmp1 = icmp ne i32 %d, 0 - %and = and i1 %cmp0, %cmp1 - - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: -; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] - -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc - - -; SI: BB9_2: -; SI: s_or_b64 exec, 
exec, [[SAVE]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - br i1 %cmp0, label %bb, label %exit - -bb: - %val = load i32, i32 addrspace(1)* %dummy - %cmp1 = icmp ne i32 %val, 0 - br label %exit - -exit: - %cond = phi i1 [false, %entry], [%cmp1, %bb] - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll deleted file mode 100644 index de830de039c..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll +++ /dev/null @@ -1,364 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone -declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; SI-LABEL @test_div_scale_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float 
addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_1: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_2: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_1: -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double 
addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_2: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_1: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_2: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind 
readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] -; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] -; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL 
@test_div_scale_f32_fabs_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_fabs_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll deleted file mode 100644 index 20c7af8ade5..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_flbit: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_flbit: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %r = call i32 
@llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll deleted file mode 100644 index e098dd35d6d..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.fabs.f64(double %Val) -declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}fract_f64: -; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f64_neg: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %neg = fsub double 0.0, %val - %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f64_neg_abs: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind 
{ - %val = load double, double addrspace(1)* %src, align 4 - %abs = call double @llvm.fabs.f64(double %val) - %neg = fsub double 0.0, %abs - %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll deleted file mode 100644 index 7501b4b7546..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float %Val) -declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone - -; Legacy name -declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}fract_f32: -; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_legacy_amdil: -; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_neg: -; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] -; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]] -; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %neg = fsub float 0.0, %val - %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_neg_abs: -; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| -; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]| -; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %abs = call float @llvm.fabs.f32(float %val) - %neg = fsub float 0.0, %abs - %fract = 
call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll deleted file mode 100644 index 42102e30f07..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FIXME: Store of i32 seems to be broken pre-EG somehow? - -declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_imad24: -; SI: v_mad_i32_i24 -; CM: MULADD_INT24 -; R600: MULLO_INT -; R600: ADD_INT -define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll deleted file mode 100644 index 46662f96c29..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_imax: -; SI: v_max_i32_e32 -define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_imax: -; SI: s_max_i32 -define void @scalar_imax(i32 %p0, i32 %p1) #0 { -entry: - %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.imax(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll deleted file mode 100644 index 34b454e2375..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_imin: -; SI: v_min_i32_e32 -define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %min = 
call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_imin: -; SI: s_min_i32 -define void @scalar_imin(i32 %p0, i32 %p1) #0 { -entry: - %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.imin(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll deleted file mode 100644 index fdc1172260b..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_imul24: -; SI: v_mul_i32_i24 -; CM: MUL_INT24 -; R600: MULLO_INT -define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll deleted file mode 100644 index 057708e7b5c..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kill_gs_const: -; SI-NOT: v_cmpx_le_f32 -; SI: s_mov_b64 exec, 0 - -define void @kill_gs_const() #0 { -main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %3) - ret void -} - -; SI-LABEL: {{^}}kill_vcc_implicit_def: -; SI-NOT: v_cmp_gt_f32_e32 vcc, -; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} -; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { -entry: - %tmp0 = fcmp olt float %13, 0.0 - call void @llvm.AMDGPU.kill(float %14) - %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) - ret void -} - -declare void 
@llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="2" } -attributes #1 = { "ShaderType"="0" } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll deleted file mode 100644 index a59c0ce6d67..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone -declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_ldexp_f32: -; SI: v_ldexp_f32 -; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { - %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_ldexp_f64: -; SI: v_ldexp_f64 -; SI: s_endpgm -define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { - %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll deleted file mode 100644 index 4cafd563685..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_legacy_f32: -; SI: v_rsq_legacy_f32_e32 -; EG: RECIPSQRT_IEEE -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll deleted file mode 100644 index 83b56a5029d..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.mul(float ,float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll deleted file mode 100644 index d2a655bf909..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f64: -; SI: v_rcp_f64_e32 -define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rcp_pat_f64: -; SI: v_rcp_f64_e32 -define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = fdiv double 1.0, %src - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE-NOT: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64 -; SI-SAFE: v_rcp_f64 -define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone - %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll deleted file mode 100644 index edd6e9a72f1..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s - -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone - -declare float @llvm.sqrt.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f32: -; SI: v_rcp_f32_e32 -; EG: RECIP_IEEE -define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Evergreen only ever does unsafe fp math. 
-; FUNC-LABEL: {{^}}rcp_pat_f32: - -; SI-SAFE: v_rcp_f32_e32 -; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 - -; EG: RECIP_IEEE - -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = fdiv float 1.0, %src - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: -; SI-UNSAFE: v_rsq_f32_e32 -; SI-SAFE: v_sqrt_f32_e32 -; SI-SAFE: v_rcp_f32_e32 - -; EG: RECIPSQRT_IEEE -define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone - %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll deleted file mode 100644 index 67f1d22c717..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f64: -; SI: v_rsq_clamp_f64_e32 - -; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] -; TODO: this constant should be folded: -; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 -; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] -; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] - -define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { - %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone - store double %rsq_clamped, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll deleted file mode 100644 index eeff2536b23..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f32: -; SI: v_rsq_clamp_f32_e32 - -; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} -; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] -; TODO: this constant should be folded: -; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff -; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] - -; EG: RECIPSQRT_CLAMPED - -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone - store float %rsq_clamped, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll deleted file mode 100644 index 36b72f14db1..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn 
-mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; EG: RECIPSQRT_IEEE -define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; TODO: Really these should be constant folded -; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll b/test/CodeGen/R600/llvm.AMDGPU.tex.ll deleted file mode 100644 index 10206609bb5..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.tex.ll +++ /dev/null @@ -1,42 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN - -define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %addr = load <4 x float>, <4 x float> addrspace(1)* %in - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, 
i32 0, i32 7) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) - %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) - %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) - %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) - %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) - %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) - %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) - %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) - %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) - store <4 x float> %res16, <4 x float> addrspace(1)* %out - ret void -} - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll deleted file mode 100644 index 6b546a7e17c..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_trig_preop_f64: -; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] -; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: -; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll deleted file mode 100644 index 74792e50017..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: {{^}}amdgpu_trunc: -; SI: v_trunc_f32 - -define void @amdgpu_trunc(float addrspace(1)* %out, float %x) { -entry: - %0 = call float @llvm.AMDGPU.trunc(float %x) - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDGPU.trunc(float ) readnone diff --git 
a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll deleted file mode 100644 index 77a073b0cb0..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}test_umad24: -; SI: v_mad_u32_u24 -; EG: MULADD_UINT24 -; R600: MULLO_UINT -; R600: ADD_INT -define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}commute_umad24: -; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]] -; SI: buffer_store_dword [[RESULT]] -define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1 - - %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4 - %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4 - %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out.gep, align 4 - ret void -} - diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll deleted file mode 100644 index a97d103016d..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_umax: -; SI: v_max_u32_e32 -define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_umax: -; SI: s_max_u32 -define void @scalar_umax(i32 %p0, i32 %p1) #0 { -entry: - %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}trunc_zext_umax: -; SI: buffer_load_ubyte [[VREG:v[0-9]+]], -; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] -; SI-NOT: and -; SI: 
buffer_store_short [[RESULT]], -define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = zext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = zext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.umax(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll deleted file mode 100644 index 2acd10e0c63..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_umin: -; SI: v_min_u32_e32 -define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_umin: -; SI: s_min_u32 -define void @scalar_umin(i32 %p0, i32 %p1) #0 { -entry: - %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}trunc_zext_umin: -; SI: buffer_load_ubyte [[VREG:v[0-9]+]], -; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] -; SI-NOT: and -; SI: buffer_store_short [[RESULT]], -define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = zext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = zext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.umin(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll deleted file mode 100644 index 76624a078b3..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - 
-declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_umul24: -; SI: v_mul_u32_u24 -; R600: MUL_UINT24 -; R600: MULLO_UINT -define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.ll b/test/CodeGen/R600/llvm.SI.fs.interp.ll deleted file mode 100644 index 3d05da616e4..00000000000 --- a/test/CodeGen/R600/llvm.SI.fs.interp.ll +++ /dev/null @@ -1,59 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s - -;GCN-LABEL: {{^}}main: -;GCN-NOT: s_wqm -;GCN: s_mov_b32 -;GCN-NEXT: v_interp_mov_f32 -;GCN: v_interp_p1_f32 -;GCN: v_interp_p2_f32 - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { -main_body: - %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) - %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7) - ret void -} - -; Thest that v_interp_p1 uses different source and destination registers -; on 16 bank LDS chips. - -; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: -; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] - -define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { -main_body: - %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) - %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) - %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) - %25 = call float @fabs(float %22) - %26 = call float @fabs(float %23) - %27 = call float @fabs(float %24) - %28 = call i32 @llvm.SI.packf16(float %25, float %26) - %29 = bitcast i32 %28 to float - %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) - %31 = bitcast i32 %30 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) - ret void -} - -; Function Attrs: readnone -declare float @fabs(float) #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.constant(i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll deleted file mode 100644 index 275cb580bc9..00000000000 --- a/test/CodeGen/R600/llvm.SI.gather4.ll +++ /dev/null @@ -1,509 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde 
-verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}gather4_v2: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl: -;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l: -;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b: -;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl: -;CHECK: 
image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_v8: -;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_v2: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_o: -;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o_v8: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o_v8: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o_v8: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_o_v8() #0 { -main_body: - 
%r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_o: -;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_o: -;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c: -;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_v8: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> 
undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_v8: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_v8: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl: -;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x 
float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz: -;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c_o: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_o_v8: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_o: -;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_o: -;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement 
<4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_o: -;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl_o: -;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, 
<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll deleted file mode 100644 index 06ee98e91b3..00000000000 --- a/test/CodeGen/R600/llvm.SI.getlod.ll +++ /dev/null @@ -1,45 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}getlod: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v2: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v4: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod_v4() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - - -declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.ll b/test/CodeGen/R600/llvm.SI.image.ll deleted file mode 100644 index 0fac8d79956..00000000000 --- a/test/CodeGen/R600/llvm.SI.image.ll +++ /dev/null @@ -1,50 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}image_load: -;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} -define void @image_load() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}image_load_mip: -;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @image_load_mip() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}getresinfo: -;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getresinfo() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.sample.ll b/test/CodeGen/R600/llvm.SI.image.sample.ll deleted file mode 100644 index 4bc638a2806..00000000000 --- a/test/CodeGen/R600/llvm.SI.image.sample.ll +++ /dev/null @@ -1,310 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} -define void @sample_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void 
@sample_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cl() #0 { 
-main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b_cl() 
#0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x 
float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/test/CodeGen/R600/llvm.SI.image.sample.o.ll deleted file mode 100644 index 9d8935414ed..00000000000 --- a/test/CodeGen/R600/llvm.SI.image.sample.o.ll +++ /dev/null @@ -1,310 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 
1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 
0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 
0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll deleted file mode 100644 index b67716c3b66..00000000000 --- a/test/CodeGen/R600/llvm.SI.imageload.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, - <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, - <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, - <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, - <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, - <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x 
i32> %v6, - <32 x i8> undef, i32 6) - %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, - <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, - <32 x i8> undef, i32 11) - %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, - <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, - <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s9 = add i32 %s5, %e10 - %s10 = add i32 %s9, %e11 - %s14 = add i32 %s10, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -; Test that ccordinates are stored in vgprs and not sgprs -; CHECK: vgpr_coords -; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} -define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 - %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 - %22 = getelementptr float, float addrspace(2)* %21, i32 0 - %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 - %24 = getelementptr float, float addrspace(2)* %21, i32 1 - %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 - %26 = getelementptr float, float addrspace(2)* %21, i32 4 - %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 - %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 - %30 = bitcast float %27 to i32 - %31 = bitcast float %23 to i32 - %32 = bitcast float %25 to i32 - %33 = insertelement <4 x i32> undef, i32 %31, i32 0 - %34 = insertelement <4 x i32> %33, i32 %32, i32 1 - %35 = insertelement <4 x i32> %34, i32 %30, i32 2 - %36 = insertelement <4 x i32> %35, i32 undef, i32 3 - %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) - %38 = extractelement <4 x i32> %37, i32 0 - %39 = extractelement <4 x i32> %37, i32 1 - %40 = extractelement <4 x i32> %37, 
i32 2 - %41 = extractelement <4 x i32> %37, i32 3 - %42 = bitcast i32 %38 to float - %43 = bitcast i32 %39 to float - %44 = bitcast i32 %40 to float - %45 = bitcast i32 %41 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) - ret void -} - -declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone -; Function Attrs: nounwind readnone -declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{} -!2 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll deleted file mode 100644 index f6c258539d5..00000000000 --- a/test/CodeGen/R600/llvm.SI.load.dword.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -; Example of a simple geometry shader loading vertex attributes from the -; ESGS ring buffer - -; FIXME: Out of bounds immediate offset crashes - -; CHECK-LABEL: {{^}}main: -; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc -; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc - -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { -main_body: - %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) - %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) - %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) - %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp19 = bitcast i32 %tmp18 to float - - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp21 = bitcast i32 %tmp20 to float - - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp23 = bitcast i32 %tmp22 to float - - call void @llvm.SI.export(i32 15, i32 0, i32 1, 
i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) - ret void -} - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll deleted file mode 100644 index ac95fd0b83a..00000000000 --- a/test/CodeGen/R600/llvm.SI.resinfo.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, - i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { - %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) - %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) - %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) - %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) - %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) - %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) - %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) - %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) - %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = 
extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t4 = extractelement <4 x i32> %res7, i32 0 - %t5 = extractelement <4 x i32> %res7, i32 3 - %e7 = add i32 %t4, %t5 - %t6 = extractelement <4 x i32> %res8, i32 1 - %t7 = extractelement <4 x i32> %res8, i32 2 - %e8 = add i32 %t6, %t7 - %t8 = extractelement <4 x i32> %res9, i32 1 - %t9 = extractelement <4 x i32> %res9, i32 3 - %e9 = add i32 %t8, %t9 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t16 = extractelement <4 x i32> %res12, i32 0 - %t17 = extractelement <4 x i32> %res12, i32 1 - %t18 = extractelement <4 x i32> %res12, i32 3 - %t19 = add i32 %t16, %t17 - %e12 = add i32 %t18, %t19 - %t20 = extractelement <4 x i32> %res13, i32 0 - %t21 = extractelement <4 x i32> %res13, i32 2 - %t22 = extractelement <4 x i32> %res13, i32 3 - %t23 = add i32 %t20, %t21 - %e13 = add i32 %t22, %t23 - %t24 = extractelement <4 x i32> %res14, i32 1 - %t25 = extractelement <4 x i32> %res14, i32 2 - %t26 = extractelement <4 x i32> %res14, i32 3 - %t27 = add i32 %t24, %t25 - %e14 = add i32 %t26, %t27 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s6 = add i32 %s5, %e7 - %s7 = add i32 %s6, %e8 - %s8 = add i32 %s7, %e9 - %s9 = add i32 %s8, %e10 - %s10 = add i32 %s9, %e11 - %s11 = add i32 %s10, %e12 - %s12 = add i32 %s11, %e13 - %s13 = add i32 %s12, %e14 - %s14 = add i32 %s13, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll deleted file mode 100644 index ce9558cbf81..00000000000 --- a/test/CodeGen/R600/llvm.SI.sample-masked.ll +++ /dev/null @@ -1,96 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s - -; CHECK-LABEL: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; 
CHECK-LABEL: {{^}}v2: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 -define void @v2(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v3: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -define void @v3(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v4: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 -define void @v4(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v5: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -define void @v5(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v6: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 -define void @v6(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v7: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 -define void @v7(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll deleted file mode 100644 index 509c45f588b..00000000000 --- a/test/CodeGen/R600/llvm.SI.sample.ll +++ /dev/null @@ -1,160 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga 
-verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, - <32 x i8> undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> 
%res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -; CHECK: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - %5 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) - ret void -} - - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll deleted file mode 100644 index 
f2badff2a99..00000000000 --- a/test/CodeGen/R600/llvm.SI.sampled.ll +++ /dev/null @@ -1,143 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, - <32 x i8> 
undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> %res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll b/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll deleted file mode 100644 index 2198590f2df..00000000000 --- a/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll +++ /dev/null @@ -1,20 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}main: -; BOTH: 
s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; BOTH-NEXT: s_sendmsg Gs_done(nop) -; BOTH-NEXT: s_endpgm - -define void @main(i32 inreg %a) #0 { -main_body: - call void @llvm.SI.sendmsg(i32 3, i32 %a) - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #1 - -attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll deleted file mode 100644 index 09675d50335..00000000000 --- a/test/CodeGen/R600/llvm.SI.sendmsg.ll +++ /dev/null @@ -1,24 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg Gs(emit stream 0) -; CHECK: s_sendmsg Gs(cut stream 1) -; CHECK: s_sendmsg Gs(emit-cut stream 2) -; CHECK: s_sendmsg Gs_done(nop) - -define void @main() { -main_body: - call void @llvm.SI.sendmsg(i32 34, i32 0); - call void @llvm.SI.sendmsg(i32 274, i32 0); - call void @llvm.SI.sendmsg(i32 562, i32 0); - call void @llvm.SI.sendmsg(i32 3, i32 0); - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll deleted file mode 100644 index 71f51548a5f..00000000000 --- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll +++ /dev/null @@ -1,47 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}test1: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test1(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test2: -;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test2(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, - i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test3: -;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test3(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, - i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test4: -;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test4(i32 %vdata, i32 %vaddr) #0 { - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, - i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void 
@llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll deleted file mode 100644 index f6e6d7050ba..00000000000 --- a/test/CodeGen/R600/llvm.SI.tid.ll +++ /dev/null @@ -1,18 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s - -;GCN: v_mbcnt_lo_u32_b32_e64 -;SI: v_mbcnt_hi_u32_b32_e32 -;VI: v_mbcnt_hi_u32_b32_e64 - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { -main_body: - %4 = call i32 @llvm.SI.tid() - %5 = bitcast i32 %4 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void -} - -declare i32 @llvm.SI.tid() readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/test/CodeGen/R600/llvm.amdgpu.dp4.ll deleted file mode 100644 index 036cd2ca82a..00000000000 --- a/test/CodeGen/R600/llvm.amdgpu.dp4.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone - -define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { - %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 - %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 - %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone - store float %dp4, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/test/CodeGen/R600/llvm.amdgpu.kilp.ll deleted file mode 100644 index 42df6db1ccf..00000000000 --- a/test/CodeGen/R600/llvm.amdgpu.kilp.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kilp_gs_const: -; SI: s_mov_b64 exec, 0 -define void @kilp_gs_const() #0 { -main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kilp(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kilp(float %3) - ret void -} - -declare void @llvm.AMDGPU.kilp(float) - -attributes #0 = { "ShaderType"="2" } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/test/CodeGen/R600/llvm.amdgpu.lrp.ll deleted file mode 100644 index 4e4c2ec7791..00000000000 --- a/test/CodeGen/R600/llvm.amdgpu.lrp.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}test_lrp: -; SI: v_sub_f32 -; SI: v_mad_f32 -define void @test_lrp(float 
addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { - %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone - store float %mad, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll deleted file mode 100644 index c65df8b3e8d..00000000000 --- a/test/CodeGen/R600/llvm.cos.ll +++ /dev/null @@ -1,41 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -;FUNC-LABEL: test -;EG: MULADD_IEEE * -;EG: FRACT * -;EG: ADD * -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG-NOT: COS -;SI: v_cos_f32 -;SI-NOT: v_cos_f32 - -define void @test(float addrspace(1)* %out, float %x) #1 { - %cos = call float @llvm.cos.f32(float %x) - store float %cos, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: testv -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG-NOT: COS -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI-NOT: v_cos_f32 - -define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { - %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) - store <4 x float> %cos, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.cos.f32(float) readnone -declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll deleted file mode 100644 index 42698925aae..00000000000 --- a/test/CodeGen/R600/llvm.exp2.ll +++ /dev/null @@ -1,80 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -;FUNC-LABEL: {{^}}test: -;EG: EXP_IEEE -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 - -define void @test(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.exp2.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv2: -;EG: EXP_IEEE -;EG: EXP_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. 
-;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 -;SI: v_exp_f32 - -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv4: -;EG: EXP_IEEE -;EG: EXP_IEEE -;EG: EXP_IEEE -;EG: EXP_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 -;SI: v_exp_f32 -;SI: v_exp_f32 -;SI: v_exp_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.exp2.f32(float) readnone -declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll deleted file mode 100644 index c75e7850b35..00000000000 --- a/test/CodeGen/R600/llvm.log2.ll +++ /dev/null @@ -1,80 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -;FUNC-LABEL: {{^}}test: -;EG: LOG_IEEE -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 - -define void @test(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.log2.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv2: -;EG: LOG_IEEE -;EG: LOG_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. 
-;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 -;SI: v_log_f32 - -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv4: -;EG: LOG_IEEE -;EG: LOG_IEEE -;EG: LOG_IEEE -;EG: LOG_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 -;SI: v_log_f32 -;SI: v_log_f32 -;SI: v_log_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.log2.f32(float) readnone -declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll deleted file mode 100644 index e491732cf9c..00000000000 --- a/test/CodeGen/R600/llvm.memcpy.ll +++ /dev/null @@ -1,365 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind -declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind - - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: 
ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2: -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind - ret void -} - -; FIXME: Use 64-bit ops -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 
addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1: -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2: -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort - -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short - -; 
SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind - ret void -} diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll deleted file mode 100644 index c4ae652619c..00000000000 --- a/test/CodeGen/R600/llvm.pow.ll +++ /dev/null @@ -1,40 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-LABEL: test1: -;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, - -define void @test1(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = call float @llvm.pow.f32( float %r0, float %r1) - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -;CHECK-LABEL: test2: -;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: LOG_IEEE * 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { - %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.pow.f32(float ,float ) readonly -declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll deleted file mode 100644 index c63fb172794..00000000000 --- a/test/CodeGen/R600/llvm.rint.f64.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rint_f64: -; CI: v_rndne_f64_e32 - -; SI-DAG: v_add_f64 -; SI-DAG: v_add_f64 -; SI-DAG v_cmp_gt_f64_e64 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: s_endpgm -define void @rint_f64(double addrspace(1)* %out, double %in) { -entry: - %0 = call double @llvm.rint.f64(double %in) - store double %0, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v2f64: -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { -entry: - %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) - store <2 x double> %0, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v4f64: -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { -entry: - %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) - store <4 x double> %0, <4 x double> addrspace(1)* %out - ret void -} - - -declare double @llvm.rint.f64(double) #0 -declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 -declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0 diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll deleted file mode 100644 index 661db51ad03..00000000000 --- a/test/CodeGen/R600/llvm.rint.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rint_f32: -; R600: RNDNE - -; SI: v_rndne_f32_e32 -define void @rint_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.rint.f32(float %in) #0 - store float %0, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v2f32: -; R600: RNDNE -; R600: RNDNE - -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v4f32: -; R600: 
RNDNE -; R600: RNDNE -; R600: RNDNE -; R600: RNDNE - -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32: -; R600: RNDNE - -; SI: v_rndne_f32_e32 -define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0 - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDIL.round.nearest.f32(float) #0 -declare float @llvm.rint.f32(float) #0 -declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0 -declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0 - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.round.f64.ll b/test/CodeGen/R600/llvm.round.f64.ll deleted file mode 100644 index 3d0f57e3328..00000000000 --- a/test/CodeGen/R600/llvm.round.f64.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}round_f64: -; SI: s_endpgm -define void @round_f64(double addrspace(1)* %out, double %x) #0 { - %result = call double @llvm.round.f64(double %x) #1 - store double %result, double addrspace(1)* %out - ret void -} - -; This is a pretty large function, so just test a few of the -; instructions that are necessary. - -; FUNC-LABEL: {{^}}v_round_f64: -; SI: buffer_load_dwordx2 -; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 - -; SI-DAG: v_not_b32_e32 -; SI-DAG: v_not_b32_e32 - -; SI-DAG: v_cmp_eq_i32 - -; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff -; SI-DAG: v_cmp_gt_i32_e64 -; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] - -; SI-DAG: v_cmp_gt_i32_e64 - - -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid - %x = load double, double addrspace(1)* %gep - %result = call double @llvm.round.f64(double %x) #1 - store double %result, double addrspace(1)* %out.gep - ret void -} - -; FUNC-LABEL: {{^}}round_v2f64: -; SI: s_endpgm -define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { - %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v4f64: -; SI: s_endpgm -define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { - %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v8f64: -; SI: s_endpgm -define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { - %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 - store <8 x double> %result, <8 x double> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 - -declare double @llvm.round.f64(double) #1 -declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 -declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 -declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind 
readnone } diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll deleted file mode 100644 index f5f124d915a..00000000000 --- a/test/CodeGen/R600/llvm.round.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}round_f32: -; SI-DAG: s_load_dword [[SX:s[0-9]+]] -; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff -; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] -; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] -; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] -; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] -; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| -; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] -; SI: buffer_store_dword [[RESULT]] - -; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] -; R600-DAG: ADD {{.*}}, -; R600-DAG: BFI_INT -; R600-DAG: SETGE -; R600-DAG: CNDE -; R600-DAG: ADD -define void @round_f32(float addrspace(1)* %out, float %x) #0 { - %result = call float @llvm.round.f32(float %x) #1 - store float %result, float addrspace(1)* %out - ret void -} - -; The vector tests are really difficult to verify, since it can be hard to -; predict how the scheduler will order the instructions. We already have -; a test for the scalar case, so the vector tests just check that the -; compiler doesn't crash. - -; FUNC-LABEL: {{^}}round_v2f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { - %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v4f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { - %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v8f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { - %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 - store <8 x float> %result, <8 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.round.f32(float) #1 -declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 -declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 -declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll deleted file mode 100644 index 3bb245c2e24..00000000000 --- a/test/CodeGen/R600/llvm.sin.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI 
-check-prefix=SI-UNSAFE -check-prefix=FUNC %s - -; FUNC-LABEL: sin_f32 -; EG: MULADD_IEEE * -; EG: FRACT * -; EG: ADD * -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG-NOT: SIN -; SI: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 - -define void @sin_f32(float addrspace(1)* %out, float %x) #1 { - %sin = call float @llvm.sin.f32(float %x) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_3x_f32: -; SI-UNSAFE-NOT: v_add_f32 -; SI-UNSAFE: 0x3ef47644 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_mul_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 3.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_2x_f32: -; SI-UNSAFE-NOT: v_add_f32 -; SI-UNSAFE: 0x3ea2f983 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_add_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 2.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_2sin_f32: -; SI-UNSAFE: 0x3ea2f983 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_add_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 2.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_v4f32: -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG-NOT: SIN -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 - -define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { - %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) - store <4 x float> %sin, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.sin.f32(float) readnone -declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll deleted file mode 100644 index c6da047f539..00000000000 --- a/test/CodeGen/R600/llvm.sqrt.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI - -; R600-LABEL: {{^}}sqrt_f32: -; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z -; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS -; SI-LABEL: {{^}}sqrt_f32: -; SI: v_sqrt_f32_e32 -define void @sqrt_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.sqrt.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -; R600-LABEL: {{^}}sqrt_v2f32: -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS -; SI-LABEL: {{^}}sqrt_v2f32: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> 
%in) { -entry: - %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; R600-LABEL: {{^}}sqrt_v4f32: -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS -; SI-LABEL: {{^}}sqrt_v4f32: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check: -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check(float addrspace(1)* %out, float %in) { -entry: - %sqrt = call float @llvm.sqrt.f32(float %in) - %cmp = fcmp olt float %in, -0.000000e+00 - %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt - store float %res, float addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_ult: -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) { -entry: - %sqrt = call float @llvm.sqrt.f32(float %in) - %cmp = fcmp ult float %in, -0.000000e+00 - %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt - store float %res, float addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_v2: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - %cmp = fcmp olt <2 x float> %in, - %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt - store <2 x float> %res, <2 x float> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_v2_ult -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - %cmp = fcmp ult <2 x float> %in, - %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt - store <2 x float> %res, <2 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.sqrt.f32(float %in) -declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll deleted file mode 100644 index 0ca49fde3e7..00000000000 --- a/test/CodeGen/R600/load-i1.ll +++ /dev/null @@ -1,149 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}global_copy_i1_to_i1: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: buffer_store_byte -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: AND_INT -define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* 
%in) nounwind { - %load = load i1, i1 addrspace(1)* %in - store i1 %load, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}local_copy_i1_to_i1: -; SI: ds_read_u8 -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: ds_write_b8 -; SI: s_endpgm - -; EG: LDS_UBYTE_READ_RET -; EG: AND_INT -; EG: LDS_BYTE_WRITE -define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind { - %load = load i1, i1 addrspace(3)* %in - store i1 %load, i1 addrspace(3)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}constant_copy_i1_to_i1: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: buffer_store_byte -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: AND_INT -define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind { - %load = load i1, i1 addrspace(2)* %in - store i1 %load, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}global_sextload_i1_to_i32: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: buffer_store_dword -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: BFE_INT -define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm - -define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_sextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i64 - store i64 %ext, i64 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i64 - store i64 %ext, i64 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 -; SI: buffer_store_byte -; SI: s_endpgm -define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { - store i1 %x, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_zext_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { - %ext = zext i1 %x to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_zext_i64: -; SI: buffer_load_ubyte -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { - %ext = zext i1 %x to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_sext_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { - %ext = sext i1 %x to i32 - store i32 %ext, i32addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_sext_i64: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: v_ashrrev_i32 -; SI: 
buffer_store_dwordx2 -; SI: s_endpgm -define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { - %ext = sext i1 %x to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll deleted file mode 100644 index 1daf0e6527b..00000000000 --- a/test/CodeGen/R600/load-input-fold.ll +++ /dev/null @@ -1,117 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = load <4 x float>, <4 x float> addrspace(8)* null - %13 = extractelement <4 x float> %12, i32 0 - %14 = fmul float %0, %13 - %15 = load <4 x float>, <4 x float> addrspace(8)* null - %16 = extractelement <4 x float> %15, i32 1 - %17 = fmul float %0, %16 - %18 = load <4 x float>, <4 x float> addrspace(8)* null - %19 = extractelement <4 x float> %18, i32 2 - %20 = fmul float %0, %19 - %21 = load <4 x float>, <4 x float> addrspace(8)* null - %22 = extractelement <4 x float> %21, i32 3 - %23 = fmul float %0, %22 - %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %25 = extractelement <4 x float> %24, i32 0 - %26 = fmul float %1, %25 - %27 = fadd float %26, %14 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %29 = extractelement <4 x float> %28, i32 1 - %30 = fmul float %1, %29 - %31 = fadd float %30, %17 - %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %33 = extractelement <4 x float> %32, i32 2 - %34 = fmul float %1, %33 - %35 = fadd float %34, %20 - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %37 = extractelement <4 x float> %36, i32 3 - %38 = fmul float %1, %37 - %39 = fadd float %38, %23 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %2, %41 - %43 = fadd float %42, %27 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %2, %45 - %47 = fadd float %46, %31 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %2, %49 - %51 = fadd float %50, %35 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %53 = 
extractelement <4 x float> %52, i32 3 - %54 = fmul float %2, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %3, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %3, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %3, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %3, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %73 = extractelement <4 x float> %72, i32 0 - %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %75 = extractelement <4 x float> %74, i32 1 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %77 = extractelement <4 x float> %76, i32 2 - %78 = insertelement <4 x float> undef, float %4, i32 0 - %79 = insertelement <4 x float> %78, float %5, i32 1 - %80 = insertelement <4 x float> %79, float %6, i32 2 - %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3 - %82 = insertelement <4 x float> undef, float %73, i32 0 - %83 = insertelement <4 x float> %82, float %75, i32 1 - %84 = insertelement <4 x float> %83, float %77, i32 2 - %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 - %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85) - %87 = insertelement <4 x float> undef, float %86, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll deleted file mode 100644 index 93b1b51a0d0..00000000000 --- a/test/CodeGen/R600/load.ll +++ /dev/null @@ -1,709 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - 
-;===------------------------------------------------------------------------===; -; GLOBAL ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load an i8 value from the global address space. -; FUNC-LABEL: {{^}}load_i8: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} - -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { - %1 = load i8, i8 addrspace(1)* %in - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_sext: -; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 8 -; SI: buffer_load_sbyte -define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = load i8, i8 addrspace(1)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8: -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in - %1 = zext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_sext: -; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: 8 -; R600-DAG: 8 - -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in - %1 = sext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8: -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in - %1 = zext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_sext: -; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] -; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal -; R600-DAG: 8 -; R600-DAG: 8 -; R600-DAG: 8 -; R600-DAG: 8 -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in - %1 = sext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; Load an i16 value from the global address space. 
-; FUNC-LABEL: {{^}}load_i16: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { -entry: - %0 = load i16 , i16 addrspace(1)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext: -; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 16 -; SI: buffer_load_sshort -define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { -entry: - %0 = load i16, i16 addrspace(1)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16: -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in - %1 = zext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_sext: -; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: 16 -; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in - %1 = sext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16: -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in - %1 = zext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_sext: -; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] -; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal -; R600-DAG: 16 -; R600-DAG: 16 -; R600-DAG: 16 -; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in - %1 = sext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; load an i32 value from the global address space. 
-; FUNC-LABEL: {{^}}load_i32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: buffer_load_dword v{{[0-9]+}} -define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; load a f32 value from the global address space. -; FUNC-LABEL: {{^}}load_f32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: buffer_load_dword v{{[0-9]+}} -define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - store float %0, float addrspace(1)* %out - ret void -} - -; load a v2f32 value from the global address space -; FUNC-LABEL: {{^}}load_v2f32: -; R600: MEM_RAT -; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 -define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(1)* %in - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64: -; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 -define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { -entry: - %0 = load i64, i64 addrspace(1)* %in - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64_sext: -; R600: MEM_RAT -; R600: MEM_RAT -; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x -; R600: 31 -; SI: buffer_load_dword - -define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = sext i32 %0 to i64 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64_zext: -; R600: MEM_RAT -; R600: MEM_RAT -define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = zext i32 %0 to i64 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v8i32: -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { -entry: - %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in - store <8 x i32> %0, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v16i32: -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. 
-; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { -entry: - %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in - store <16 x i32> %0, <16 x i32> addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; CONSTANT ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load a sign-extended i8 value -; FUNC-LABEL: {{^}}load_const_i8_sext: -; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 8 -; SI: buffer_load_sbyte v{{[0-9]+}}, -define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = load i8, i8 addrspace(2)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an aligned i8 value -; FUNC-LABEL: {{^}}load_const_i8_aligned: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = load i8, i8 addrspace(2)* %in - %1 = zext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an un-aligned i8 value -; FUNC-LABEL: {{^}}load_const_i8_unaligned: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 - %1 = load i8, i8 addrspace(2)* %0 - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; Load a sign-extended i16 value -; FUNC-LABEL: {{^}}load_const_i16_sext: -; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 16 -; SI: buffer_load_sshort -define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = load i16, i16 addrspace(2)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an aligned i16 value -; FUNC-LABEL: {{^}}load_const_i16_aligned: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = load i16, i16 addrspace(2)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an un-aligned i16 value -; FUNC-LABEL: {{^}}load_const_i16_unaligned: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 - %1 = load i16, i16 addrspace(2)* %0 - %2 = zext i16 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; Load an i32 value from the constant address space. 
-; FUNC-LABEL: {{^}}load_const_addrspace_i32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: s_load_dword s{{[0-9]+}} -define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { -entry: - %0 = load i32, i32 addrspace(2)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Load a f32 value from the constant address space. -; FUNC-LABEL: {{^}}load_const_addrspace_f32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: s_load_dword s{{[0-9]+}} -define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) { - %1 = load float, float addrspace(2)* %in - store float %1, float addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; LOCAL ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load an i8 value from the local address space. -; FUNC-LABEL: {{^}}load_i8_local: -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { - %1 = load i8, i8 addrspace(3)* %in - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_sext_local: -; R600: LDS_UBYTE_READ_RET -; R600: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { -entry: - %0 = load i8, i8 addrspace(3)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_local: -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in - %1 = zext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_sext_local: -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in - %1 = sext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_local: -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in - %1 = zext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_sext_local: -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in - %1 = sext 
<4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; Load an i16 value from the local address space. -; FUNC-LABEL: {{^}}load_i16_local: -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { -entry: - %0 = load i16 , i16 addrspace(3)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext_local: -; R600: LDS_USHORT_READ_RET -; R600: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { -entry: - %0 = load i16, i16 addrspace(3)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_local: -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in - %1 = zext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_sext_local: -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in - %1 = sext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_local: -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in - %1 = zext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_sext_local: -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in - %1 = sext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; load an i32 value from the local address space. -; FUNC-LABEL: {{^}}load_i32_local: -; R600: LDS_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_b32 -define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = load i32, i32 addrspace(3)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; load a f32 value from the local address space. 
-; FUNC-LABEL: {{^}}load_f32_local: -; R600: LDS_READ_RET -; SI: s_mov_b32 m0 -; SI: ds_read_b32 -define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) { -entry: - %0 = load float, float addrspace(3)* %in - store float %0, float addrspace(1)* %out - ret void -} - -; load a v2f32 value from the local address space -; FUNC-LABEL: {{^}}load_v2f32_local: -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; SI: s_mov_b32 m0 -; SI: ds_read_b64 -define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(3)* %in - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; Test loading a i32 and v2i32 value from the same base pointer. -; FUNC-LABEL: {{^}}load_i32_v2i32_local: -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; SI-DAG: ds_read_b32 -; SI-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { - %scalar = load i32, i32 addrspace(3)* %in - %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* - %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 - %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4 - %vec1 = insertelement <2 x i32> , i32 %scalar, i32 0 - %vec = add <2 x i32> %vec0, %vec1 - store <2 x i32> %vec, <2 x i32> addrspace(1)* %out - ret void -} - - -@lds = addrspace(3) global [512 x i32] undef, align 4 - -; On SI we need to make sure that the base offset is a register and not -; an immediate. -; FUNC-LABEL: {{^}}load_i32_local_const_ptr: -; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 -; SI: ds_read_b32 v0, v[[ZERO]] offset:4 -; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 - %tmp1 = load i32, i32 addrspace(3)* %tmp0 - %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp1, i32 addrspace(1)* %tmp2 - ret void -} diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll deleted file mode 100644 index 02f883cd8e9..00000000000 --- a/test/CodeGen/R600/load.vec.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; load a v2i32 value from the global address space. -; EG: {{^}}load_v2i32: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0 -; SI: {{^}}load_v2i32: -; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - store <2 x i32> %a, <2 x i32> addrspace(1)* %out - ret void -} - -; load a v4i32 value from the global address space. 
-; EG: {{^}}load_v4i32: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0 -; SI: {{^}}load_v4i32: -; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}] -define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - store <4 x i32> %a, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll deleted file mode 100644 index 74beabdc007..00000000000 --- a/test/CodeGen/R600/load64.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; load a f64 value from the global address space. -; CHECK-LABEL: {{^}}load_f64: -; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %1 = load double, double addrspace(1)* %in - store double %1, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}load_i64: -; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tmp = load i64, i64 addrspace(1)* %in - store i64 %tmp, i64 addrspace(1)* %out, align 8 - ret void -} - -; Load a f64 value from the constant address space. -; CHECK-LABEL: {{^}}load_const_addrspace_f64: -; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) { - %1 = load double, double addrspace(2)* %in - store double %1, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll deleted file mode 100644 index 33f3159d13e..00000000000 --- a/test/CodeGen/R600/local-64.ll +++ /dev/null @@ -1,167 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}local_i32_load -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 -; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 - %val = load i32, i32 addrspace(3)* %gep, align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i32_load_0_offset -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} -; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { - %val = load i32, i32 addrspace(3)* %in, align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: -; BOTH-NOT: ADD -; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 -; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { - %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 - %val = load i8, i8 addrspace(3)* %gep, align 4 - store i8 %val, i8 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: 
{{^}}local_i8_load_over_i16_max_offset: -; The LDS offset will be 65536 bytes, which is larger than the size of LDS on -; SI, which is why it is being OR'd with the base pointer. -; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] -; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] -; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { - %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 - %val = load i8, i8 addrspace(3)* %gep, align 4 - store i8 %val, i8 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 - %val = load i64, i64 addrspace(3)* %gep, align 8 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { - %val = load i64, i64 addrspace(3)* %in, align 8 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { - %gep = getelementptr double, double addrspace(3)* %in, i32 7 - %val = load double, double addrspace(3)* %gep, align 8 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { - %val = load double, double addrspace(3)* %in, align 8 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_i64_store(i64 addrspace(3)* %out) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 - store i64 5678, i64 addrspace(3)* %gep, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { - store i64 1234, i64 addrspace(3)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_f64_store(double addrspace(3)* %out) nounwind { - %gep = getelementptr double, double addrspace(3)* %out, i32 7 - store double 16.0, double addrspace(3)* %gep, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_store_0_offset -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { - store double 20.0, double addrspace(3)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_v2i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, 
{{v\[[0-9]+:[0-9]+\]}} offset:112 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 -; BOTH: s_endpgm -define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { - %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 - store <2 x i64> , <2 x i64> addrspace(3)* %gep, align 16 - ret void -} - -; BOTH-LABEL: {{^}}local_v2i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 -; BOTH: s_endpgm -define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { - store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 - ret void -} - -; BOTH-LABEL: {{^}}local_v4i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 -; BOTH: s_endpgm -define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { - %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 - store <4 x i64> , <4 x i64> addrspace(3)* %gep, align 16 - ret void -} - -; BOTH-LABEL: {{^}}local_v4i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 -; BOTH: s_endpgm -define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { - store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll deleted file mode 100644 index 2aaf977ab90..00000000000 --- a/test/CodeGen/R600/local-atomics.ll +++ /dev/null @@ -1,551 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32: -; EG: LDS_WRXCHG_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset: -; EG: LDS_WRXCHG_RET * -; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - 
ret void -} - -; XXX - Is it really necessary to load 4 into VGPR? -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: -; EG: LDS_ADD_RET * -; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: -; EG: LDS_ADD_RET * -; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset: -; EG: LDS_ADD_RET * -; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: -; EG: LDS_SUB_RET * -; GCN: ds_sub_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}lds_atomic_sub_ret_i32_offset: -; EG: LDS_SUB_RET * -; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32: -; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: -; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: -; EG: LDS_AND_RET * -; GCN: ds_and_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: -; EG: LDS_AND_RET * -; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: -; EG: LDS_OR_RET * -; GCN: ds_or_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: -; EG: LDS_OR_RET * -; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: -; EG: LDS_XOR_RET * -; GCN: ds_xor_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: -; EG: LDS_XOR_RET * -; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 
addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst -; store i32 %result, i32 addrspace(1)* %out, align 4 -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32: -; EG: LDS_MIN_INT_RET * -; GCN: ds_min_rtn_i32 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset: -; EG: LDS_MIN_INT_RET * -; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32: -; EG: LDS_MAX_INT_RET * -; GCN: ds_max_rtn_i32 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset: -; EG: LDS_MAX_INT_RET * -; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32: -; EG: LDS_MIN_UINT_RET * -; GCN: ds_min_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset: -; EG: LDS_MIN_UINT_RET * -; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32: -; EG: LDS_MAX_UINT_RET * -; GCN: ds_max_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset: -; EG: LDS_MAX_UINT_RET * -; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = 
getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: -; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; XXX - Is it really necessary to load 4 into VGPR? -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_add_u32 [[VPTR]], [[DATA]] -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: -; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset -; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset: -; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: -; GCN: ds_sub_u32 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw 
sub i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset: -; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32: -; GCN: ds_and_b32 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset: -; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32: -; GCN: ds_or_b32 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset: -; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32: -; GCN: ds_xor_b32 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset: -; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: -; GCN: ds_min_i32 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: -; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: -; GCN: ds_max_i32 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: -; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: -; GCN: ds_min_u32 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: -; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: -; GCN: ds_max_u32 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: -; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll deleted file mode 100644 index 0ffa5e751b7..00000000000 --- a/test/CodeGen/R600/local-atomics64.ll +++ /dev/null @@ -1,470 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64: -; GCN: ds_wrxchg_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: -; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 
addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64: -; GCN: ds_add_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: -; GCN: ds_inc_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64: -; GCN: ds_sub_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: -; GCN: ds_sub_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: -; GCN: ds_dec_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void 
@lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64: -; GCN: ds_and_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset: -; GCN: ds_and_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64: -; GCN: ds_or_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset: -; GCN: ds_or_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64: -; GCN: ds_xor_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset: -; GCN: ds_xor_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64: -; GCN: ds_min_rtn_i64 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset: -; GCN: ds_min_rtn_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64: -; GCN: ds_max_rtn_i64 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset: -; GCN: ds_max_rtn_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64: -; GCN: ds_min_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: -; GCN: ds_min_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64: -; GCN: ds_max_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: -; GCN: ds_max_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64: -; GCN: ds_wrxchg_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: -; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - 
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64: -; GCN: ds_add_u64 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: -; GCN: ds_inc_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64: -; GCN: ds_sub_u64 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: -; GCN: ds_sub_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: -; GCN: ds_dec_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64: -; GCN: ds_and_b64 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset: -; GCN: ds_and_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i64: -; GCN: ds_or_b64 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset: -; GCN: ds_or_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64: -; GCN: ds_xor_b64 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset: -; GCN: ds_xor_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64: -; GCN: ds_min_i64 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset: -; GCN: ds_min_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64: -; GCN: ds_max_i64 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset: -; GCN: ds_max_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64: -; GCN: ds_min_u64 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset: -; GCN: ds_min_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64: -; GCN: ds_max_u64 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset: -; GCN: ds_max_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* 
%ptr, i32 4 - %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll deleted file mode 100644 index 06a8b1246e6..00000000000 --- a/test/CodeGen/R600/local-memory-two-objects.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s - -@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 -@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 - - -; Check that the LDS size emitted correctly -; EG: .long 166120 -; EG-NEXT: .long 8 -; GCN: .long 47180 -; GCN-NEXT: .long 38792 - -; EG: {{^}}local_memory_two_objects: - -; We would like to check the the lds writes are using different -; addresses, but due to variations in the scheduler, we can't do -; this consistently on evergreen GPUs. -; EG: LDS_WRITE -; EG: LDS_WRITE -; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]] -; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]] - -; GROUP_BARRIER must be the last instruction in a clause -; EG: GROUP_BARRIER -; EG-NEXT: ALU clause - -; Make sure the lds reads are using different addresses, at different -; constant offsets. -; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] -; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] -; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 -; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] - -define void @local_memory_two_objects(i32 addrspace(1)* %out) { -entry: - %x.i = call i32 @llvm.r600.read.tidig.x() #0 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i - store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 - %mul = shl nsw i32 %x.i, 1 - %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i - store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 - %sub = sub nsw i32 3, %x.i - call void @llvm.AMDGPU.barrier.local() - %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub - %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i - store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 - %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub - %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 - %add = add nsw i32 %x.i, 4 - %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add - store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll deleted file mode 100644 index 9494ed75bd0..00000000000 --- a/test/CodeGen/R600/local-memory.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc 
-march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 - - -; Check that the LDS size emitted correctly -; EG: .long 166120 -; EG-NEXT: .long 128 -; SI: .long 47180 -; SI-NEXT: .long 71560 -; CI: .long 47180 -; CI-NEXT: .long 38792 - -; FUNC-LABEL: {{^}}local_memory: - -; EG: LDS_WRITE -; SI-NOT: s_wqm_b64 -; SI: ds_write_b32 - -; GROUP_BARRIER must be the last instruction in a clause -; EG: GROUP_BARRIER -; EG-NEXT: ALU clause -; SI: s_barrier - -; EG: LDS_READ_RET -; SI: ds_read_b32 {{v[0-9]+}}, - -define void @local_memory(i32 addrspace(1)* %out) { -entry: - %y.i = call i32 @llvm.r600.read.tidig.x() #0 - %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i - store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 - %add = add nsw i32 %y.i, 1 - %cmp = icmp eq i32 %add, 16 - %.add = select i1 %cmp, i32 0, i32 %add - call void @llvm.AMDGPU.barrier.local() - %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add - %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i - store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll deleted file mode 100644 index f60d574497d..00000000000 --- a/test/CodeGen/R600/loop-address.ll +++ /dev/null @@ -1,34 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood < %s | FileCheck %s - -;CHECK: ALU_PUSH -;CHECK: LOOP_START_DX10 @11 -;CHECK: LOOP_BREAK @10 -;CHECK: POP @10 - -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { -entry: - %cmp5 = icmp sgt i32 %iterations, 0 - br i1 %cmp5, label %for.body, label %for.end - -for.body: ; preds = %for.body, %entry - %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] - %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %i.07 = add nsw i32 %i.07.in, -1 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 - store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 - %add = add nsw i32 %ai.06, 1 - %exitcond = icmp eq i32 %add, %iterations - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } - -!opencl.kernels = !{!0, !1, !2, !3} - -!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge} -!1 = !{null} -!2 = !{null} -!3 = !{null} diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll deleted file mode 100644 index 5fd9806813c..00000000000 --- a/test/CodeGen/R600/loop-idiom.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s - - -; Make sure loop-idiom 
doesn't create memcpy or memset. There are no library -; implementations of these for R600. - -; FUNC: @no_memcpy -; R600-NOT: {{^}}llvm.memcpy -; SI-NOT: {{^}}llvm.memcpy -define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { -entry: - %dest = alloca i8, i32 32 - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%4, %for.body] - %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0 - %2 = getelementptr i8, i8* %dest, i32 %0 - %3 = load i8, i8 addrspace(3)* %1 - store i8 %3, i8* %2 - %4 = add i32 %0, 1 - %5 = icmp eq i32 %4, %size - br i1 %5, label %for.end, label %for.body - -for.end: - ret void -} - -; FUNC: @no_memset -; R600-NOT: {{^}}llvm.memset -; R600-NOT: {{^}}memset_pattern16: -; SI-NOT: {{^}}llvm.memset -; SI-NOT: {{^}}memset_pattern16: -define void @no_memset(i32 %size) { -entry: - %dest = alloca i8, i32 32 - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%2, %for.body] - %1 = getelementptr i8, i8* %dest, i32 %0 - store i8 0, i8* %1 - %2 = add i32 %0, 1 - %3 = icmp eq i32 %2, %size - br i1 %3, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll deleted file mode 100644 index 9ac988d38d1..00000000000 --- a/test/CodeGen/R600/lshl.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = mul i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll deleted file mode 100644 index 50e444ac26b..00000000000 --- a/test/CodeGen/R600/lshr.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = udiv i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/m0-spill.ll b/test/CodeGen/R600/m0-spill.ll deleted file mode 100644 index 1dddc85f775..00000000000 --- a/test/CodeGen/R600/m0-spill.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -@lds = external addrspace(3) global [64 x float] - -; CHECK-LABEL: {{^}}main: -; CHECK-NOT: v_readlane_b32 m0 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { -main_body: - %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %cmp = fcmp ueq float 0.0, %4 - br i1 %cmp, label %if, label %else - -if: - %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 - %lds_data = load float, float addrspace(3)* %lds_ptr - br 
label %endif - -else: - %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - br label %endif - -endif: - %export = phi float [%lds_data, %if], [%interp, %else] - %5 = call i32 @llvm.SI.packf16(float %export, float %export) - %6 = bitcast i32 %5 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6) - ret void -} - -declare float @llvm.SI.fs.constant(i32, i32, i32) readnone - -declare i32 @llvm.SI.packf16(float, float) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/mad-combine.ll b/test/CodeGen/R600/mad-combine.ll deleted file mode 100644 index bc071628ead..00000000000 --- a/test/CodeGen/R600/mad-combine.ll +++ /dev/null @@ -1,567 +0,0 @@ -; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. - -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s - -; Make sure we don't form mad with denormals -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.fma.f32(float, float, float) #0 -declare float @llvm.fmuladd.f32(float, float, float) #0 - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_mad_f32_0: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF-NOT: v_fma -; SI-DENORM-SLOWFMAF-NOT: v_mad - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fadd float %mul, %c - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: -; SI-DAG: 
buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fadd float %mul, %c - %fma1 = fadd float %mul, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fadd x, (fmul y, z)) -> (fma y, z, x) -; FUNC-LABEL: {{^}}combine_to_mad_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fadd float %c, %mul - store float %fma, float addrspace(1)* %gep.out - ret 
void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fsub float %mul, %c - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fsub 
float %mul, %c - %fma1 = fsub float %mul, %d - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fsub float %c, %mul - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - 
%mul = fmul float %a, %b - %fma0 = fsub float %c, %mul - %fma1 = fsub float %d, %mul - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] - -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma = fsub float %mul.neg, %c - - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, 
float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul.neg, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] - -; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]] -; 
SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fma y, z, (fmul u, v))) -; -> (fma (fneg y), z, (fma (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] - -; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float 
addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] - -; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fmuladd y, z, (fmul u, v))) -; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] - -; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-DENORM: v_fma_f32 
[[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/mad-sub.ll b/test/CodeGen/R600/mad-sub.ll deleted file mode 100644 index aa4194ff610..00000000000 --- a/test/CodeGen/R600/mad-sub.ll +++ /dev/null @@ -1,215 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare float @llvm.fabs.f32(float) #0 - -; FUNC-LABEL: {{^}}mad_sub_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_inv_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float 
addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %c, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_f64: -; SI: v_mul_f64 -; SI: v_add_f64 -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext - %a = load double, double addrspace(1)* %gep0, align 8 - %b = load double, double addrspace(1)* %gep1, align 8 - %c = load double, double addrspace(1)* %gep2, align 8 - %mul = fmul double %a, %b - %sub = fsub double %mul, %c - store double %sub, double addrspace(1)* %outgep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_fabs_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c.abs - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float 
addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %c.abs, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}neg_neg_mad_f32: -; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %nega = fsub float -0.000000e+00, %a - %negb = fsub float -0.000000e+00, %b - %mul = fmul float %nega, %negb - %sub = fadd float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_fabs_sub_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %b.abs = call float @llvm.fabs.f32(float %b) #0 - %mul = fmul float %a, %b.abs - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; SI: buffer_store_dword [[RESULT]] -define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %r2, %add - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}fsub_fadd_a_a_c: -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] -; SI: buffer_store_dword [[RESULT]] -define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %add, %r2 - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll deleted file mode 100644 index 86d75a63ca4..00000000000 --- a/test/CodeGen/R600/mad_int24.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}i32_mad24: -; Signed 24-bit multiply is not supported on pre-Cayman GPUs. -; EG: MULLO_INT -; Make sure we aren't masking the inputs. -; CM-NOT: AND -; CM: MULADD_INT24 -; SI-NOT: and -; SI: v_mad_i32_i24 -define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = shl i32 %a, 8 - %a_24 = ashr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = ashr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - %3 = add i32 %2, %c - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @test_imul24 -; SI: v_mad_i32_i24 -define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone - %add = add i32 %mul, %src2 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll deleted file mode 100644 index 95fe3411959..00000000000 --- a/test/CodeGen/R600/mad_uint24.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}u32_mad24: -; EG: MULADD_UINT24 -; SI: v_mad_u32_u24 - -define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = shl i32 %a, 8 - %a_24 = lshr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = lshr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - %3 = add i32 %2, %c - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i16_mad24: -; The order of A and B does not matter. 
-; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG: 16 -; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 - -define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { -entry: - %0 = mul i16 %a, %b - %1 = add i16 %0, %c - %2 = sext i16 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i8_mad24: -; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG: 8 -; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - -define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { -entry: - %0 = mul i8 %a, %b - %1 = add i8 %0, %c - %2 = sext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; This tests for a bug where the mad_u24 pattern matcher would call -; SimplifyDemandedBits on the first operand of the mul instruction -; assuming that the pattern would be matched to a 24-bit mad. This -; led to some instructions being incorrectly erased when the entire -; 24-bit mad pattern wasn't being matched. - -; Check that the select instruction is not deleted. -; FUNC-LABEL: {{^}}i24_i32_i32_mad: -; EG: CNDE_INT -; SI: v_cndmask -define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { -entry: - %0 = ashr i32 %a, 8 - %1 = icmp ne i32 %c, 0 - %2 = select i1 %1, i32 %0, i32 34 - %3 = mul i32 %2, %c - %4 = add i32 %3, %d - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/madak.ll b/test/CodeGen/R600/madak.ll deleted file mode 100644 index 933bb016d2c..00000000000 --- a/test/CodeGen/R600/madak.ll +++ /dev/null @@ -1,193 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; FIXME: Enable VI - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; GCN-LABEL: {{^}}madak_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 -define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure this is only folded with one use. This is a code size -; optimization and if we fold the immediate multiple times, we'll undo -; it. 
- -; GCN-LABEL: {{^}}madak_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] -; GCN: s_endpgm -define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2 - - %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 - - %mul0 = fmul float %a, %b - %mul1 = fmul float %a, %c - %madak0 = fadd float %mul0, 10.0 - %madak1 = fadd float %mul1, 10.0 - - store float %madak0, float addrspace(1)* %out.gep.0, align 4 - store float %madak1, float addrspace(1)* %out.gep.1, align 4 - ret void -} - -; GCN-LABEL: {{^}}madak_m_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - - %mul = fmul float 4.0, %a - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure nothing weird happens with a value that is also allowed as -; an inline immediate. 
- -; GCN-LABEL: {{^}}madak_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 4.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; We can't use an SGPR when forming madak -; GCN-LABEL: {{^}}s_v_madak_f32: -; GCN: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] -define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: @v_s_madak_f32 -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] -define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}s_s_madak_f32: -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -; GCN: s_endpgm -define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 
- - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %mul = fmul float %a.fabs, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} -; GCN: s_endpgm -define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %mul = fmul float %a, %b.fabs - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} diff --git a/test/CodeGen/R600/madmk.ll b/test/CodeGen/R600/madmk.ll deleted file mode 100644 index ba7bb221a99..00000000000 --- a/test/CodeGen/R600/madmk.ll +++ /dev/null @@ -1,205 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; GCN-LABEL: {{^}}madmk_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}madmk_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] -; GCN: s_endpgm -define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %in.gep.2 = getelementptr 
float, float addrspace(1)* %in.gep.0, i32 2 - - %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 - - %mul0 = fmul float %a, 10.0 - %mul1 = fmul float %a, 10.0 - %madmk0 = fadd float %mul0, %b - %madmk1 = fadd float %mul1, %c - - store float %madmk0, float addrspace(1)* %out.gep.0, align 4 - store float %madmk1, float addrspace(1)* %out.gep.1, align 4 - ret void -} - -; We don't get any benefit if the constant is an inline immediate. -; GCN-LABEL: {{^}}madmk_inline_imm_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] -define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %mul = fmul float %a, 4.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}s_s_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}v_s_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}scalar_vector_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %b = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %mul = fmul float %a.fabs, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| -define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b.fabs - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]] -; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 -define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, 2.0 - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: {{^}}kill_madmk_verifier_error: -; SI: s_xor_b64 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c -; SI: s_or_b64 -define void @kill_madmk_verifier_error() nounwind { -bb: - br label %bb2 - -bb1: ; preds = %bb2 - ret void - -bb2: ; preds = %bb6, %bb - %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] - %tmp3 = fsub float undef, %tmp - %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 - br i1 %tmp5, label %bb1, label %bb6 - -bb6: ; preds = %bb2 - %tmp4 = fmul float %tmp, undef - %tmp7 = fmul float %tmp4, 0x40E55DD180000000 - %tmp8 = fadd float %tmp7, undef - br label %bb2 -} diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll deleted file mode 100644 index c357524b140..00000000000 --- a/test/CodeGen/R600/max-literals.ll +++ /dev/null @@ -1,67 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: ADD * - -define void @main(<4 x float> 
inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = fadd float %0, 2.0 - %6 = fadd float %1, 3.0 - %7 = fadd float %2, 4.0 - %8 = fadd float %3, 5.0 - %9 = bitcast float %4 to i32 - %10 = mul i32 %9, 6 - %11 = bitcast i32 %10 to float - %12 = insertelement <4 x float> undef, float %5, i32 0 - %13 = insertelement <4 x float> %12, float %6, i32 1 - %14 = insertelement <4 x float> %13, float %7, i32 2 - %15 = insertelement <4 x float> %14, float %8, i32 3 - %16 = insertelement <4 x float> %15, float %11, i32 3 - - %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) - %18 = insertelement <4 x float> undef, float %17, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) - ret void -} - -; CHECK-LABEL: {{^}}main2: -; CHECK-NOT: ADD * - -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = fadd float %0, 2.0 - %6 = fadd float %1, 3.0 - %7 = fadd float %2, 4.0 - %8 = fadd float %3, 2.0 - %9 = bitcast float %4 to i32 - %10 = mul i32 %9, 6 - %11 = bitcast i32 %10 to float - %12 = insertelement <4 x float> undef, float %5, i32 0 - %13 = insertelement <4 x float> %12, float %6, i32 1 - %14 = insertelement <4 x float> %13, float %7, i32 2 - %15 = insertelement <4 x float> %14, float %8, i32 3 - %16 = insertelement <4 x float> %15, float %11, i32 3 - - %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) - %18 = insertelement <4 x float> undef, float %17, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll deleted file mode 100644 index fef3e2f0a21..00000000000 --- a/test/CodeGen/R600/max.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imax_sge_i32 -; SI: v_max_i32_e32 -define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imax_sge_i32 -; SI: s_max_i32 -define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 
4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32: -; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 -define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sge i32 %a, 9 - %val = select i1 %cmp, i32 %a, i32 9 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: -; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 -define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sgt i32 %a, 9 - %val = select i1 %cmp, i32 %a, i32 9 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_imax_sgt_i32 -; SI: v_max_i32_e32 -define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sgt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imax_sgt_i32 -; SI: s_max_i32 -define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sgt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umax_uge_i32 -; SI: v_max_u32_e32 -define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp uge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umax_uge_i32 -; SI: s_max_u32 -define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp uge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umax_ugt_i32 -; SI: v_max_u32_e32 -define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ugt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umax_ugt_i32 -; SI: s_max_u32 -define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp ugt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure redundant and removed -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: -; SI-DAG: 
s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { - %a.ext = zext i16 %a to i32 - %b.ext = zext i16 %b to i32 - %cmp = icmp ugt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %mask = and i32 %val, 65535 - store i32 %mask, i32 addrspace(1)* %out - ret void -} - -; Make sure redundant sign_extend_inreg removed. - -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { - %a.ext = sext i16 %a to i32 - %b.ext = sext i16 %b to i32 - %cmp = icmp sgt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %shl = shl i32 %val, 16 - %sextinreg = ashr i32 %shl, 16 - store i32 %sextinreg, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should get match min/max through extends inserted by -; legalization. - -; FUNC-LABEL: {{^}}s_test_imin_sge_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_ge_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { - %cmp = icmp sge i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll deleted file mode 100644 index cfb94b272e5..00000000000 --- a/test/CodeGen/R600/max3.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imax3_sgt_i32 -; SI: v_max3_i32 -define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp sgt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp sgt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umax3_ugt_i32 -; SI: v_max3_u32 -define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load 
i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp ugt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp ugt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/merge-stores.ll b/test/CodeGen/R600/merge-stores.ll deleted file mode 100644 index dbf9d4481ff..00000000000 --- a/test/CodeGen/R600/merge-stores.ll +++ /dev/null @@ -1,536 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s - -; Run with devices with different unaligned load restrictions. - -; TODO: Vector element tests -; TODO: Non-zero base offset for load and store combinations -; TODO: Same base addrspacecasted - - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - - store i8 123, i8 addrspace(1)* %out.gep.1 - store i8 456, i8 addrspace(1)* %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - - store i8 123, i8 addrspace(1)* %out.gep.1 - store i8 456, i8 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: -; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 123, i16 addrspace(1)* %out.gep.1 - store i16 456, i16 addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: -; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 0, i16 addrspace(1)* %out.gep.1 - store i16 0, i16 addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 123, i16 addrspace(1)* %out.gep.1 - store i16 456, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: -; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = 
getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* - store float 1.0, float addrspace(1)* %out.gep.1.bc - store i32 456, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* - store i32 123, i32 addrspace(1)* %out.gep.1.bc - store float 4.0, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: -; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 333, i32 addrspace(1)* %out.gep.3 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dwordx2 v -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - - store float 8.0, float addrspace(1)* %out - store float 1.0, float addrspace(1)* %out.gep.1 - store float 2.0, float addrspace(1)* %out.gep.2 - store float 4.0, float addrspace(1)* %out.gep.3 - ret void -} - -; First store is out of order. Because of order of combines, the -; consecutive store fails because only some of the stores have been -; replaced with integer constant stores, and then won't merge because -; the types are different. 
- -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - - store float 1.0, float addrspace(1)* %out.gep.1 - store float 2.0, float addrspace(1)* %out.gep.2 - store float 4.0, float addrspace(1)* %out.gep.3 - store float 8.0, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword -; SI-NOT: buffer_store_dword -; GCN: s_endpgm -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 - %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out.gep.2 - store i64 333, i64 addrspace(1)* %out.gep.3 - store i64 1234, i64 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[LOAD]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %lo = load i32, i32 addrspace(1)* %in.gep.0 - %hi = 
load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out.gep.0 - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %hi, i32 addrspace(1)* %out - store i32 %lo, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v -; GCN: s_waitcnt -; SI-DAG: buffer_store_dword v -; SI-DAG: buffer_store_dwordx2 v -; GCN: s_endpgm -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 - - %x = load float, float addrspace(1)* %in - %y = load float, float addrspace(1)* %in.gep.1 - %z = load float, float addrspace(1)* %in.gep.2 - %w = load float, float addrspace(1)* %in.gep.3 - - store float %x, float addrspace(1)* %out - store float %y, float 
addrspace(1)* %out.gep.1 - store float %z, float addrspace(1)* %out.gep.2 - store float %w, float addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 - - %x = load i32, i32 addrspace(1)* %in.gep.0 - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out.gep.0 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: s_barrier -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.AMDGPU.barrier.local() #1 - - store i32 %w, i32 addrspace(1)* %out.gep.3 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %x, i32 addrspace(1)* %out - - ret void -} - -; TODO: Re-packing of loaded register required. Maybe an IR pass -; should catch this? 
- -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: s_barrier -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.AMDGPU.barrier.local() #1 - - store i32 %w, i32 addrspace(1)* %out - store i32 %z, i32 addrspace(1)* %out.gep.1 - store i32 %y, i32 addrspace(1)* %out.gep.2 - store i32 %x, i32 addrspace(1)* %out.gep.3 - - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: buffer_store_dword [[LOAD]] -; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in, align 4 - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out, align 4 - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; This works once AA is enabled on the subtarget -; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: -; GCN: buffer_load_dwordx4 
[[LOAD:v\[[0-9]+:[0-9]+\]]] -; XGCN: buffer_store_dwordx4 [[LOAD]] -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in - - %x = extractelement <4 x i32> %vec, i32 0 - %y = extractelement <4 x i32> %vec, i32 1 - %z = extractelement <4 x i32> %vec, i32 2 - %w = extractelement <4 x i32> %vec, i32 3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: -; GCN: ds_write_b8 -; GCN: ds_write_b8 -; GCN: s_endpgm -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 - - store i8 123, i8 addrspace(3)* %out.gep.1 - store i8 456, i8 addrspace(3)* %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: -; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] -; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out.gep.2 - store i32 333, i32 addrspace(3)* %out.gep.3 - store i32 1234, i32 addrspace(3)* %out - ret void -} - -declare void @llvm.AMDGPU.barrier.local() #1 - -attributes #0 = { nounwind } -attributes #1 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll deleted file mode 100644 index 0332d1a8e40..00000000000 --- a/test/CodeGen/R600/min.ll +++ /dev/null @@ -1,189 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imin_sle_i32 -; SI: v_min_i32_e32 -define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sle i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - 
-; FUNC-LABEL: @s_test_imin_sle_i32 -; SI: s_min_i32 -define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sle i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_imin_slt_i32 -; SI: v_min_i32_e32 -define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp slt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imin_slt_i32 -; SI: s_min_i32 -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp slt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp slt i32 %a, 8 - %val = select i1 %cmp, i32 %a, i32 8 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sle i32 %a, 8 - %val = select i1 %cmp, i32 %a, i32 8 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ule_i32 -; SI: v_min_u32_e32 -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ule i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umin_ule_i32 -; SI: s_min_u32 -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp ule i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ult_i32 -; SI: v_min_u32_e32 -define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umin_ult_i32 -; SI: s_min_u32 -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) 
nounwind { - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ult_i32_multi_use -; SI-NOT: v_min -; SI: v_cmp_lt_u32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_min -; SI: s_endpgm -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid - %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep0, align 4 - store i1 %cmp, i1 addrspace(1)* %outgep1 - ret void -} - -; Make sure redundant and removed -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { - %a.ext = zext i16 %a to i32 - %b.ext = zext i16 %b to i32 - %cmp = icmp ult i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %mask = and i32 %val, 65535 - store i32 %mask, i32 addrspace(1)* %out - ret void -} - -; Make sure redundant sign_extend_inreg removed. - -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { - %a.ext = sext i16 %a to i32 - %b.ext = sext i16 %b to i32 - %cmp = icmp slt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %shl = shl i32 %val, 16 - %sextinreg = ashr i32 %shl, 16 - store i32 %sextinreg, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should get match min/max through extends inserted by -; legalization. 
- -; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_le_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { - %cmp = icmp sle i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll deleted file mode 100644 index 38ef46d1bdd..00000000000 --- a/test/CodeGen/R600/min3.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imin3_slt_i32 -; SI: v_min3_i32 -define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp slt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin3_ult_i32 -; SI: v_min3_u32 -define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp ult i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp ult i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_umin_umin -; SI: v_min_i32 -; SI: v_min3_i32 -define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tid2 = mul i32 %tid, 2 - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - - %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 - %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 - %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 - - %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 - - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %d = load i32, i32 addrspace(1)* %gep3, align 4 - - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - - %icmp1 = icmp 
slt i32 %c, %d - %i1 = select i1 %icmp1, i32 %c, i32 %d - - %icmp2 = icmp slt i32 %i0, %i1 - %i2 = select i1 %icmp2, i32 %i0, i32 %i1 - - store i32 %i2, i32 addrspace(1)* %outgep1, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin3_2_uses -; SI-NOT: v_min3 -define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tid2 = mul i32 %tid, 2 - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - - %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 - %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 - %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 - - %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 - - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %d = load i32, i32 addrspace(1)* %gep3, align 4 - - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - - %icmp1 = icmp slt i32 %c, %d - %i1 = select i1 %icmp1, i32 %c, i32 %d - - %icmp2 = icmp slt i32 %i0, %c - %i2 = select i1 %icmp2, i32 %i0, i32 %c - - store i32 %i2, i32 addrspace(1)* %outgep0, align 4 - store i32 %i0, i32 addrspace(1)* %outgep1, align 4 - ret void -} diff --git a/test/CodeGen/R600/missing-store.ll b/test/CodeGen/R600/missing-store.ll deleted file mode 100644 index 4af9cdf1b96..00000000000 --- a/test/CodeGen/R600/missing-store.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - -@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 - -; Make sure when the load from %ptr2 is folded the chain isn't lost, -; resulting in losing the store to gptr - -; FUNC-LABEL: {{^}}missing_store_reduced: -; SI: ds_read_b64 -; SI: buffer_store_dword -; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: s_endpgm -define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } - diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll deleted file mode 100644 index b19163f294e..00000000000 --- a/test/CodeGen/R600/mubuf.ll +++ /dev/null @@ -1,183 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -declare i32 @llvm.r600.read.tidig.x() readnone - -;;;==========================================================================;;; -;;; MUBUF LOAD TESTS -;;;==========================================================================;;; - -; MUBUF load with an immediate byte offset that fits into 12-bits -; CHECK-LABEL: {{^}}mubuf_load0: -; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 -define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 - %1 = load i32, 
i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; MUBUF load with the largest possible immediate offset -; CHECK-LABEL: {{^}}mubuf_load1: -; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 -define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %1 = load i8, i8 addrspace(1)* %0 - store i8 %1, i8 addrspace(1)* %out - ret void -} - -; MUBUF load with an immediate byte offset that doesn't fit into 12-bits -; CHECK-LABEL: {{^}}mubuf_load2: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 -; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 -define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; MUBUF load with a 12-bit immediate offset and a register offset -; CHECK-LABEL: {{^}}mubuf_load3: -; CHECK-NOT: ADD -; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 -define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset - %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 - %2 = load i32, i32 addrspace(1)* %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}soffset_max_imm: -; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { -main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 - %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - %tmp4 = add i32 %6, 16 - %tmp5 = bitcast float 0.0 to i32 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - ret void -} - -; Make sure immediates that aren't inline constants don't get folded into -; the soffset operand. -; FIXME: for this test we should be smart enough to shift the immediate into -; the offset field. 
-; CHECK-LABEL: {{^}}soffset_no_fold: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 -; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { -main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 - %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - %tmp4 = add i32 %6, 16 - %tmp5 = bitcast float 0.0 to i32 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - ret void -} - -;;;==========================================================================;;; -;;; MUBUF STORE TESTS -;;;==========================================================================;;; - -; MUBUF store with an immediate byte offset that fits into 12-bits -; CHECK-LABEL: {{^}}mubuf_store0: -; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 -define void @mubuf_store0(i32 addrspace(1)* %out) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 - store i32 0, i32 addrspace(1)* %0 - ret void -} - -; MUBUF store with the largest possible immediate offset -; CHECK-LABEL: {{^}}mubuf_store1: -; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 - -define void @mubuf_store1(i8 addrspace(1)* %out) { -entry: - %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 - store i8 0, i8 addrspace(1)* %0 - ret void -} - -; MUBUF store with an immediate byte offset that doesn't fit into 12-bits -; CHECK-LABEL: {{^}}mubuf_store2: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 -; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 -define void @mubuf_store2(i32 addrspace(1)* %out) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 - store i32 0, i32 addrspace(1)* %0 - ret void -} - -; MUBUF store with a 12-bit immediate offset and a register offset -; CHECK-LABEL: {{^}}mubuf_store3: -; CHECK-NOT: ADD -; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 -define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset - %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 - store i32 0, i32 addrspace(1)* %1 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr: -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 -define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { - store i32 99, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: -; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
[[SOFFSET]] -define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: -; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 -; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 - %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst - ret void -} - -; CHECK-LABEL: {{^}}store_vgpr_ptr: -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } -attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll deleted file mode 100644 index 94e0f96b323..00000000000 --- a/test/CodeGen/R600/mul.ll +++ /dev/null @@ -1,200 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; mul24 and mad24 are affected - -; FUNC-LABEL: {{^}}test_mul_v2i32: -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = mul <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_mul_v4i32: -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = mul <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: s_mul_i32 -; SI: buffer_store_dword -define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %mul = mul i64 %b, %a - %trunc = trunc i64 %mul to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: v_mul_lo_i32 -; SI: buffer_store_dword -define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %mul = mul i64 %b, %a - %trunc = trunc i64 %mul to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top -; 32-bits of both arguments are sign bits. -; FUNC-LABEL: {{^}}mul64_sext_c: -; EG-DAG: MULLO_INT -; EG-DAG: MULHI_INT -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_i32 -define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { -entry: - %0 = sext i32 %in to i64 - %1 = mul i64 %0, 80 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_mul64_sext_c: -; EG-DAG: MULLO_INT -; EG-DAG: MULHI_INT -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_hi_i32 -; SI: s_endpgm -define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ext = sext i32 %val to i64 - %mul = mul i64 %ext, 80 - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: -; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} -; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} -; SI: s_endpgm -define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ext = sext i32 %val to i64 - %mul = mul i64 %ext, 9 - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_mul_i32: -; SI: s_load_dword [[SRC0:s[0-9]+]], -; SI: s_load_dword [[SRC1:s[0-9]+]], -; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %mul = mul i32 %a, %b - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_mul_i32: -; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = mul i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; A standard 64-bit multiply. 
The expansion should be around 6 instructions. -; It would be difficult to match the expansion correctly without writing -; a really complicated list of FileCheck expressions. I don't want -; to confuse people who may 'break' this test with a correct optimization, -; so this test just uses FUNC-LABEL to make sure the compiler does not -; crash with a 'failed to select' error. - -; FUNC-LABEL: {{^}}s_mul_i64: -define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %mul = mul i64 %a, %b - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_mul_i64: -; SI: v_mul_lo_i32 -define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %mul = mul i64 %a, %b - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}mul32_in_branch: -; SI: s_mul_i32 -define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { -entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i32, i32 addrspace(1)* %in - br label %endif - -else: - %2 = mul i32 %a, %b - br label %endif - -endif: - %3 = phi i32 [%1, %if], [%2, %else] - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}mul64_in_branch: -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_u32 -; SI: s_endpgm -define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = mul i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll deleted file mode 100644 index 7609dcc87af..00000000000 --- a/test/CodeGen/R600/mul_int24.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}i32_mul24: -; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
-; EG: MULLO_INT -; Make sure we are not masking the inputs -; CM-NOT: AND -; CM: MUL_INT24 -; SI-NOT: and -; SI: v_mul_i32_i24 -define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = shl i32 %a, 8 - %a_24 = ashr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = ashr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - store i32 %2, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll deleted file mode 100644 index e640a7cd69f..00000000000 --- a/test/CodeGen/R600/mul_uint24.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}u32_mul24: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI: v_mul_u32_u24 - -define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = shl i32 %a, 8 - %a_24 = lshr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = lshr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i16_mul24: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; EG: 16 -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 -define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) { -entry: - %0 = mul i16 %a, %b - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i8_mul24: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - -define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) { -entry: - %0 = mul i8 %a, %b - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Multiply with 24-bit inputs and 64-bit output -; FUNC-LABEL: {{^}}mul24_i64: -; EG: MUL_UINT24 -; EG: MULHI -; SI: v_mul_u32_u24 -; FIXME: SI should use 24-bit mulhi -; SI: v_mul_hi_u32 -define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = shl i64 %a, 40 - %a_24 = lshr i64 %0, 40 - %1 = shl i64 %b, 40 - %b_24 = lshr i64 %1, 40 - %2 = mul i64 %a_24, %b_24 - store i64 %2, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll deleted file mode 100644 index 29b0944a553..00000000000 --- a/test/CodeGen/R600/mulhu.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab -;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} -;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 - -define void @test(i32 %p) { - %i = udiv i32 %p, 3 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32,
<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll deleted file mode 100644 index 9a814b579de..00000000000 --- a/test/CodeGen/R600/no-initializer-constant-addrspace.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s -; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s -; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s - -@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 - -; FUNC-LABEL: {{^}}load_extern_const_init: -define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 - -; FUNC-LABEL: {{^}}load_undef_const_init: -define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/no-shrink-extloads.ll b/test/CodeGen/R600/no-shrink-extloads.ll deleted file mode 100644 index e4328ecbaca..00000000000 --- a/test/CodeGen/R600/no-shrink-extloads.ll +++ /dev/null @@ -1,191 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; Make sure we don't turn the 32-bit argument load into a 16-bit -; load. There aren't extending scalar loads, so that would require -; using a buffer_load instruction. - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: -; SI: s_load_dword s -; SI: buffer_store_short v -define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i16 - store i16 %trunc, i16 addrspace(1)* %out - ret void -} - -; It should be OK (and probably performance neutral) to reduce this, -; but we don't know if the load is uniform yet.
- -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: -; SI: buffer_load_dword v -; SI: buffer_store_short v -define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i16 - store i16 %trunc, i16 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i1 - store i1 %trunc, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i1 - store i1 %trunc, i1 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: -; SI: s_load_dword s -; SI: buffer_store_dword v -define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { - %trunc = trunc i64 %arg to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: -; SI: buffer_load_dword v -; SI: buffer_store_dword v -define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %trunc = trunc i64 %load to i32 - store i32 %trunc, i32 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: -; SI: s_load_dword s -; SI: buffer_store_dword v -define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { - %srl = lshr i64 %arg, 32 - %trunc = trunc i64 %srl to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: -; SI: buffer_load_dword v -; SI: buffer_store_dword v -define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = 
getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %srl = lshr i64 %load, 32 - %trunc = trunc i64 %srl to i32 - store i32 %trunc, i32 addrspace(1)* %gep.out - ret void -} - -; Might as well reduce to 8-bit loads. -; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { - %trunc = trunc i16 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: -; SI: buffer_load_ubyte v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i16, i16 addrspace(1)* %gep.in - %trunc = trunc i16 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { - %srl = lshr i64 %arg, 32 - %trunc = trunc i64 %srl to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %srl = lshr i64 %load, 32 - %trunc = trunc i64 %srl to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { - %trunc = trunc i64 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %trunc = trunc i64 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll deleted file mode 100644 index 816755efb07..00000000000 --- a/test/CodeGen/R600/operand-folding.ll +++ /dev/null @@ -1,113 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}fold_sgpr: -; CHECK: v_add_i32_e32 v{{[0-9]+}}, s -define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { -entry: - %tmp0 = icmp ne i32 %fold, 0 - br i1 %tmp0, label %if, label %endif - -if: - %id = call i32 @llvm.r600.read.tidig.x() - %offset = add i32 %fold, %id - %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset - store i32 0, i32 addrspace(1)* %tmp1 - br label %endif - -endif: - ret void -} - -; CHECK-LABEL: {{^}}fold_imm: -; 
CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 -define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { -entry: - %fold = add i32 3, 2 - %tmp0 = icmp ne i32 %cmp, 0 - br i1 %tmp0, label %if, label %endif - -if: - %id = call i32 @llvm.r600.read.tidig.x() - %val = or i32 %id, %fold - store i32 %val, i32 addrspace(1)* %out - br label %endif - -endif: - ret void -} - -; CHECK-LABEL: {{^}}fold_64bit_constant_add: -; CHECK-NOT: s_mov_b64 -; FIXME: It would be better if we could use v_add here and drop the extra -; v_mov_b32 instructions. -; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1 -; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0 -; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]] -; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, - -define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { -entry: - %tmp0 = add i64 %val, 1 - store i64 %tmp0, i64 addrspace(1)* %out - ret void -} - -; Inline constants should always be folded. - -; CHECK-LABEL: {{^}}vector_inline: -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} - -define void @vector_inline(<4 x i32> addrspace(1)* %out) { -entry: - %tmp0 = call i32 @llvm.r600.read.tidig.x() - %tmp1 = add i32 %tmp0, 1 - %tmp2 = add i32 %tmp0, 2 - %tmp3 = add i32 %tmp0, 3 - %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 - %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 - %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 - %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 - %tmp4 = xor <4 x i32> , %vec3 - store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out - ret void -} - -; Immediates with one use should be folded -; CHECK-LABEL: {{^}}imm_one_use: -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} - -define void @imm_one_use(i32 addrspace(1)* %out) { -entry: - %tmp0 = call i32 @llvm.r600.read.tidig.x() - %tmp1 = xor i32 %tmp0, 100 - store i32 %tmp1, i32 addrspace(1)* %out - ret void -} -; CHECK-LABEL: {{^}}vector_imm: -; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} - -define void @vector_imm(<4 x i32> addrspace(1)* %out) { -entry: - %tmp0 = call i32 @llvm.r600.read.tidig.x() - %tmp1 = add i32 %tmp0, 1 - %tmp2 = add i32 %tmp0, 2 - %tmp3 = add i32 %tmp0, 3 - %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 - %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 - %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 - %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 - %tmp4 = xor <4 x i32> , %vec3 - store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/operand-spacing.ll b/test/CodeGen/R600/operand-spacing.ll deleted file mode 100644 index 20420a84de6..00000000000 --- a/test/CodeGen/R600/operand-spacing.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s - -; Make sure there isn't an extra space 
between the instruction name and first operands. - -; GCN-LABEL: {{^}}add_f32: -; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] -; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] -; GCN: buffer_store_dword [[RESULT]], -define void @add_f32(float addrspace(1)* %out, float %a, float %b) { - %result = fadd float %a, %b - store float %result, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll deleted file mode 100644 index 1c04090b407..00000000000 --- a/test/CodeGen/R600/or.ll +++ /dev/null @@ -1,178 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}or_v2i32: -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = or <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}or_v4i32: -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = or <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_or_i32: -; SI: s_or_b32 -define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %or = or i32 %a, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_or_i32: -; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { - %loada = load i32, i32 addrspace(1)* %a - %or = or i32 %loada, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_or_literal_i32: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f -define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { - %or = or i32 %a, 99999 - store i32 %or, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}vector_or_literal_i32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { - %loada = load i32, i32 addrspace(1)* %a, align 4 - %or = or i32 %loada, 65535 - store i32 %or, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} -define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { - %loada = load i32, i32 addrspace(1)* %a, align 4 - %or = or i32 %loada, 4 - store i32 %or, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}scalar_or_i64: -; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z - -; SI: s_or_b64 -define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %or = or i64 %a, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_or_i64: -; SI: v_or_b32_e32 v{{[0-9]}} -; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 8 - %loadb = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, %loadb - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_vector_or_i64: -; SI: v_or_b32_e32 v{{[0-9]}} -; SI: v_or_b32_e32 v{{[0-9]}} -define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { - %loada = load i64, i64 addrspace(1)* %a - %or = or i64 %loada, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_or_i64_loadimm: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f -; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) 
{ - %loada = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, 22470723082367 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FIXME: The or 0 should really be removed. -; FUNC-LABEL: {{^}}vector_or_i64_imm: -; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] -; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} -; SI: s_endpgm -define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, 8 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: -; SI: s_load_dword s[[SREG0:[0-9]+]] -; SI: s_load_dword s[[SREG1:[0-9]+]] -; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %add = or i64 %b, %a - %trunc = trunc i64 %add to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} - -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { - %a = load float, float addrspace(1)* %in0 - %b = load float, float addrspace(1)* %in1 - %acmp = fcmp oge float %a, 0.000000e+00 - %bcmp = fcmp oge float %b, 0.000000e+00 - %or = or i1 %acmp, %bcmp - %result = zext i1 %or to i32 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_or_i1: -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { - %cmp0 = icmp eq i32 %a, %b - %cmp1 = icmp eq i32 %c, %d - %or = or i1 %cmp0, %cmp1 - store i1 %or, i1 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/packetizer.ll b/test/CodeGen/R600/packetizer.ll deleted file mode 100644 index 49a7c0df748..00000000000 --- a/test/CodeGen/R600/packetizer.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; CHECK: {{^}}test: -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z -; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W - -define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { -entry: - %shl = sub i32 32, %e - %x = add i32 %x_arg, 1 - %x.0 = shl i32 %x, %shl - %x.1 = lshr i32 %x, %e - %x.2 = or i32 %x.0, %x.1 - %y = add i32 %y_arg, 1 - %y.0 = shl i32 %y, %shl - %y.1 = lshr i32 %y, %e - %y.2 = or i32 %y.0, %y.1 - %z = add i32 %z_arg, 1 - %z.0 = shl i32 %z, %shl - %z.1 = lshr i32 %z, %e - %z.2 = or i32 %z.0, %z.1 - %w = add i32 %w_arg, 1 - %w.0 = shl i32 %w, %shl - %w.1 = lshr i32 %w, %e - %w.2 = or i32 %w.0, %w.1 - %xy = or i32 %x.2, %y.2 - %zw = or i32 %z.2, %w.2 - %xyzw = or i32 %xy, %zw - store i32 %xyzw, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll deleted file mode 100644 index f32b044198a..00000000000 --- a/test/CodeGen/R600/parallelandifcollapse.ll +++ /dev/null @@ -1,59 +0,0 @@ -; Function Attrs: nounwind -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s -; -; CFG flattening should use parallel-and mode to generate 
branch conditions and -; then merge if-regions with the same bodies. -; -; CHECK: AND_INT -; CHECK-NEXT: AND_INT -; CHECK-NEXT: OR_INT - -; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transformation, however now that we are using local memory for -; allocas, the transformation isn't happening. - -define void @_Z9chk1D_512v() #0 { -entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 - %cmp = icmp ne i32 %0, %1 - br i1 %cmp, label %land.lhs.true, label %if.end - -land.lhs.true:                                    ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 - %cmp1 = icmp ne i32 %2, %3 - br i1 %cmp1, label %if.then, label %if.end - -if.then:                                          ; preds = %land.lhs.true - store i32 1, i32* %data, align 4 - br label %if.end - -if.end:                                           ; preds = %if.then, %land.lhs.true, %entry - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 - %cmp2 = icmp ne i32 %4, %5 - br i1 %cmp2, label %land.lhs.true3, label %if.end6 - -land.lhs.true3:                                   ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 - %cmp4 = icmp ne i32 %6, %7 - br i1 %cmp4, label %if.then5, label %if.end6 - -if.then5:                                         ; preds = %land.lhs.true3 - store i32 1, i32* %data, align 4 - br label %if.end6 - -if.end6:                                          ; preds = %if.then5, %land.lhs.true3, %if.end - ret void -} diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll deleted file mode 100644 index 1da1e91b8ab..00000000000 --- a/test/CodeGen/R600/parallelorifcollapse.ll +++ /dev/null @@ -1,66 +0,0 @@ -; Function Attrs: nounwind -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; -; CFG flattening should use parallel-or to generate branch conditions and -; then merge if-regions with the same bodies. - -; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transformation, however now that we are using local memory for -; allocas, the transformation isn't happening.
-; XFAIL: * -; -; CHECK: OR_INT -; CHECK-NEXT: OR_INT -; CHECK-NEXT: OR_INT -define void @_Z9chk1D_512v() #0 { -entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 - %cmp = icmp ne i32 %0, %1 - br i1 %cmp, label %land.lhs.true, label %if.else - -land.lhs.true: ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 - %cmp1 = icmp ne i32 %2, %3 - br i1 %cmp1, label %if.then, label %if.else - -if.then: ; preds = %land.lhs.true - br label %if.end - -if.else: ; preds = %land.lhs.true, %entry - store i32 1, i32* %data, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 - %cmp2 = icmp ne i32 %4, %5 - br i1 %cmp2, label %land.lhs.true3, label %if.else6 - -land.lhs.true3: ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 - %cmp4 = icmp ne i32 %6, %7 - br i1 %cmp4, label %if.then5, label %if.else6 - -if.then5: ; preds = %land.lhs.true3 - br label %if.end7 - -if.else6: ; preds = %land.lhs.true3, %if.end - store i32 1, i32* %data, align 4 - br label %if.end7 - -if.end7: ; preds = %if.else6, %if.then5 - ret void -} - diff --git a/test/CodeGen/R600/predicate-dp4.ll b/test/CodeGen/R600/predicate-dp4.ll deleted file mode 100644 index 6bc18759435..00000000000 --- a/test/CodeGen/R600/predicate-dp4.ll +++ /dev/null @@ -1,27 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -; CHECK-LABEL: {{^}}main: -; CHECK: PRED_SETE_INT * Pred, -; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one -define void @main(<4 x float> inreg) #0 { -main_body: - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - br i1 %3, label %IF, label %ENDIF - -IF: ; preds = %main_body - %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0) - br label %ENDIF - -ENDIF: ; preds = %IF, %main_body - %5 = phi float [%4, %IF], [0.000000e+00, %main_body] - %6 = insertelement <4 x float> undef, float %5, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #1 = { readnone } -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll deleted file mode 100644 index 0ce74d97ba8..00000000000 --- a/test/CodeGen/R600/predicates.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s - -; These tests make sure the compiler is optimizing branches using predicates -; when it is legal to do so. 
- -; CHECK: {{^}}simple_if: -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF, label %ENDIF - -IF: - %1 = shl i32 %in, 1 - br label %ENDIF - -ENDIF: - %2 = phi i32 [ %in, %entry ], [ %1, %IF ] - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}simple_if_else: -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF, label %ELSE - -IF: - %1 = shl i32 %in, 1 - br label %ENDIF - -ELSE: - %2 = lshr i32 %in, 1 - br label %ENDIF - -ENDIF: - %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ] - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}nested_if: -; CHECK: ALU_PUSH_BEFORE -; CHECK: JUMP -; CHECK: POP -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF0, label %ENDIF - -IF0: - %1 = add i32 %in, 10 - %2 = icmp sgt i32 %1, 0 - br i1 %2, label %IF1, label %ENDIF - -IF1: - %3 = shl i32 %1, 1 - br label %ENDIF - -ENDIF: - %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1] - store i32 %4, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}nested_if_else: -; CHECK: ALU_PUSH_BEFORE -; CHECK: JUMP -; CHECK: POP -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF0, label %ENDIF - -IF0: - %1 = add i32 %in, 10 - %2 = icmp sgt i32 %1, 0 - br i1 %2, label %IF1, label %ELSE1 - -IF1: - %3 = shl i32 %1, 1 - br label %ENDIF - -ELSE1: - %4 = lshr i32 %in, 1 - br label %ENDIF - -ENDIF: - %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1] - store i32 %5, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/private-memory-atomics.ll b/test/CodeGen/R600/private-memory-atomics.ll deleted file mode 100644 index a008ac98a43..00000000000 --- a/test/CodeGen/R600/private-memory-atomics.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s - -; This works because promote allocas pass replaces these with LDS atomics. - -; Private atomics have no real use, but at least shouldn't crash on it. 
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic - %val = extractvalue { i32, i1 } %tmp4, 0 - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/private-memory-broken.ll b/test/CodeGen/R600/private-memory-broken.ll deleted file mode 100644 index 6b18a19f195..00000000000 --- a/test/CodeGen/R600/private-memory-broken.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s -; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s - -; Make sure promote alloca pass doesn't crash - -; CHECK: unsupported call - -declare i32 @foo(i32*) nounwind - -define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %val = call i32 @foo(i32* %tmp3) nounwind - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll deleted file mode 100644 index 1c562978050..00000000000 --- a/test/CodeGen/R600/private-memory.ll +++ /dev/null @@ -1,313 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}mova_same_clause: - -; R600: LDS_WRITE -; R600: LDS_WRITE -; R600: LDS_READ -; R600: LDS_READ - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 - -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: 
[0x00,0x10,0x70,0xe0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { -entry: - %stack = alloca [5 x i32], align 4 - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 - ret void -} - -; This test checks that the stack offset is calculated correctly for structs. -; All register loads/stores should be optimized away, so there shouldn't be -; any MOVA instructions. -; -; XXX: This generated code has unnecessary MOVs, we should be able to optimize -; this. - -; FUNC-LABEL: {{^}}multiple_structs: -; R600-NOT: MOVA_INT -; SI-NOT: v_movrel -; SI-NOT: v_movrel -%struct.point = type { i32, i32 } - -define void @multiple_structs(i32 addrspace(1)* %out) { -entry: - %a = alloca %struct.point - %b = alloca %struct.point - %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 - %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 - store i32 0, i32* %a.x.ptr - store i32 1, i32* %a.y.ptr - store i32 2, i32* %b.x.ptr - store i32 3, i32* %b.y.ptr - %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %a.indirect = load i32, i32* %a.indirect.ptr - %b.indirect = load i32, i32* %b.indirect.ptr - %0 = add i32 %a.indirect, %b.indirect - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Test direct access of a private array inside a loop. The private array -; loads and stores should be lowered to copies, so there shouldn't be any -; MOVA instructions. 
- -; FUNC-LABEL: {{^}}direct_loop: -; R600-NOT: MOVA_INT -; SI-NOT: v_movrel - -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %prv_array_const = alloca [2 x i32] - %prv_array = alloca [2 x i32] - %a = load i32, i32 addrspace(1)* %in - %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %b = load i32, i32 addrspace(1)* %b_src_ptr - %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - store i32 %a, i32* %a_dst_ptr - %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 - store i32 %b, i32* %b_dst_ptr - br label %for.body - -for.body: - %inc = phi i32 [0, %entry], [%count, %for.body] - %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - %x = load i32, i32* %x_ptr - %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %y = load i32, i32* %y_ptr - %xy = add i32 %x, %y - store i32 %xy, i32* %y_ptr - %count = add i32 %inc, 1 - %done = icmp eq i32 %count, 4095 - br i1 %done, label %for.end, label %for.body - -for.end: - %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %value = load i32, i32* %value_ptr - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}short_array: - -; R600: MOVA_INT - -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0 -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0 -; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} -define void @short_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [2 x i16] - %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1 - store i16 0, i16* %1 - store i16 1, i16* %2 - %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index - %4 = load i16, i16* %3 - %5 = sext i16 %4 to i32 - store i32 %5, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}char_array: - -; R600: MOVA_INT - -; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0 -; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0 -define void @char_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [2 x i8] - %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1 - store i8 0, i8* %1 - store i8 1, i8* %2 - %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index - %4 = load i8, i8* %3 - %5 = sext i8 %4 to i32 - store i32 %5, i32 addrspace(1)* %out - ret void - -} - -; Make sure we don't overwrite workitem information with private memory - -; FUNC-LABEL: {{^}}work_item_info: -; R600-NOT: MOV T0.X -; Additional check in case the move ends up in the last slot -; R600-NOT: MOV * TO.X - -; SI-NOT: v_mov_b32_e{{(32|64)}} v0 -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = alloca [2 x i32] - %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 - store i32 0, i32* %1 - store i32 1, i32* %2 - %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in - %4 = load i32, i32* %3 - %5 = call i32 @llvm.r600.read.tidig.x() - %6 = add i32 %4, %5 - store i32 %6, i32 addrspace(1)* 
%out - ret void -} - -; Test that two stack objects are not stored in the same register -; The second stack object should be in T3.X -; FUNC-LABEL: {{^}}no_overlap: -; R600_CHECK: MOV -; R600_CHECK: [[CHAN:[XYZW]]]+ -; R600-NOT: [[CHAN]]+ -; SI: v_mov_b32_e32 v3 -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = alloca [3 x i8], align 1 - %1 = alloca [2 x i8], align 1 - %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 - %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 - %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 - %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 - %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 - store i8 0, i8* %2 - store i8 1, i8* %3 - store i8 2, i8* %4 - store i8 1, i8* %5 - store i8 0, i8* %6 - %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in - %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in - %9 = load i8, i8* %7 - %10 = load i8, i8* %8 - %11 = add i8 %9, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - ret void -} - -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i8]] - %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 - store i8 0, i8* %gep0 - store i8 1, i8* %gep1 - %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index - %load = load i8, i8* %gep2 - %sext = sext i8 %load to i32 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i64]] - %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 - store i64 0, i64* %gep0 - store i64 1, i64* %gep1 - %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index - %load = load i64, i64* %gep2 - store i64 %load, i64 addrspace(1)* %out - ret void -} - -%struct.pair32 = type { i32, i32 } - -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x %struct.pair32]] - %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x %struct.pair32] - %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x 
%struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %cmp = icmp eq i32 %in, 0 - %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 - %load = load i32, i32* %sel - store i32 %load, i32 addrspace(1)* %out - ret void -} - -; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it -; finds one, it should stop trying to promote. - -; FUNC-LABEL: ptrtoint: -; SI-NOT: ds_write -; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %alloca = alloca [16 x i32] - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - store i32 5, i32* %tmp0 - %tmp1 = ptrtoint [16 x i32]* %alloca to i32 - %tmp2 = add i32 %tmp1, 5 - %tmp3 = inttoptr i32 %tmp2 to i32* - %tmp4 = getelementptr i32, i32* %tmp3, i32 %b - %tmp5 = load i32, i32* %tmp4 - store i32 %tmp5, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/pv-packing.ll b/test/CodeGen/R600/pv-packing.ll deleted file mode 100644 index abeae563ff3..00000000000 --- a/test/CodeGen/R600/pv-packing.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -;CHECK: DOT4 T{{[0-9]\.X}} -;CHECK: MULADD_IEEE * T{{[0-9]\.W}} - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg2, i32 0 - %4 = extractelement <4 x float> %reg2, i32 1 - %5 = extractelement <4 x float> %reg2, i32 2 - %6 = extractelement <4 x float> %reg3, i32 0 - %7 = extractelement <4 x float> %reg3, i32 1 - %8 = extractelement <4 x float> %reg3, i32 2 - %9 = load <4 x float>, <4 x float> addrspace(8)* null - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9) - %12 = fmul float %0, %3 - %13 = fadd float %12, %6 - %14 = fmul float %1, %4 - %15 = fadd float %14, %7 - %16 = fmul float %2, %5 - %17 = fadd float %16, %8 - %18 = fmul float %11, %11 - %19 = fadd float %18, %0 - %20 = insertelement <4 x float> undef, float %13, i32 0 - %21 = insertelement <4 x float> %20, float %15, i32 1 - %22 = insertelement <4 x float> %21, float %17, i32 2 - %23 = insertelement <4 x float> %22, float %19, i32 3 - %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10) - %25 = insertelement <4 x float> undef, float %24, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { 
"ShaderType"="1" } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll deleted file mode 100644 index 9a57dd19765..00000000000 --- a/test/CodeGen/R600/pv.ll +++ /dev/null @@ -1,241 +0,0 @@ -; RUN: llc < %s -march=r600 | FileCheck %s - -; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) -; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = extractelement <4 x float> %reg4, i32 0 - %13 = extractelement <4 x float> %reg4, i32 1 - %14 = extractelement <4 x float> %reg4, i32 2 - %15 = extractelement <4 x float> %reg4, i32 3 - %16 = extractelement <4 x float> %reg5, i32 0 - %17 = extractelement <4 x float> %reg5, i32 1 - %18 = extractelement <4 x float> %reg5, i32 2 - %19 = extractelement <4 x float> %reg5, i32 3 - %20 = extractelement <4 x float> %reg6, i32 0 - %21 = extractelement <4 x float> %reg6, i32 1 - %22 = extractelement <4 x float> %reg6, i32 2 - %23 = extractelement <4 x float> %reg6, i32 3 - %24 = extractelement <4 x float> %reg7, i32 0 - %25 = extractelement <4 x float> %reg7, i32 1 - %26 = extractelement <4 x float> %reg7, i32 2 - %27 = extractelement <4 x float> %reg7, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* null - %29 = extractelement <4 x float> %28, i32 0 - %30 = fmul float %0, %29 - %31 = load <4 x float>, <4 x float> addrspace(8)* null - %32 = extractelement <4 x float> %31, i32 1 - %33 = fmul float %0, %32 - %34 = load <4 x float>, <4 x float> addrspace(8)* null - %35 = extractelement <4 x float> %34, i32 2 - %36 = fmul float %0, %35 - %37 = load <4 x float>, <4 x float> addrspace(8)* null - %38 = extractelement <4 x float> %37, i32 3 - %39 = fmul float %0, %38 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %1, %41 - %43 = fadd float %42, %30 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %1, %45 - %47 = fadd float %46, %33 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %1, %49 - %51 = fadd float %50, %36 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %53 = extractelement <4 x float> %52, i32 3 - %54 = fmul float %1, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* 
null, i64 0, i32 2) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %2, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %2, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %2, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %2, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %73 = extractelement <4 x float> %72, i32 0 - %74 = fmul float %3, %73 - %75 = fadd float %74, %59 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 1 - %78 = fmul float %3, %77 - %79 = fadd float %78, %63 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = fmul float %3, %81 - %83 = fadd float %82, %67 - %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %85 = extractelement <4 x float> %84, i32 3 - %86 = fmul float %3, %85 - %87 = fadd float %86, %71 - %88 = insertelement <4 x float> undef, float %4, i32 0 - %89 = insertelement <4 x float> %88, float %5, i32 1 - %90 = insertelement <4 x float> %89, float %6, i32 2 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 - %92 = insertelement <4 x float> undef, float %4, i32 0 - %93 = insertelement <4 x float> %92, float %5, i32 1 - %94 = insertelement <4 x float> %93, float %6, i32 2 - %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 - %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) - %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq.f32(float %97) - %99 = fmul float %4, %98 - %100 = fmul float %5, %98 - %101 = fmul float %6, %98 - %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %103 = extractelement <4 x float> %102, i32 0 - %104 = fmul float %103, %8 - %105 = fadd float %104, %20 - %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %107 = extractelement <4 x float> %106, i32 1 - %108 = fmul float %107, %9 - %109 = fadd float %108, %21 - %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %111 = extractelement <4 x float> %110, i32 2 - %112 = fmul float %111, %10 - %113 = fadd float %112, %22 - %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) - %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) - %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, 
float 1.000000e+00) - %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %119 = extractelement <4 x float> %118, i32 0 - %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %121 = extractelement <4 x float> %120, i32 1 - %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %123 = extractelement <4 x float> %122, i32 2 - %124 = insertelement <4 x float> undef, float %99, i32 0 - %125 = insertelement <4 x float> %124, float %100, i32 1 - %126 = insertelement <4 x float> %125, float %101, i32 2 - %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 - %128 = insertelement <4 x float> undef, float %119, i32 0 - %129 = insertelement <4 x float> %128, float %121, i32 1 - %130 = insertelement <4 x float> %129, float %123, i32 2 - %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 - %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131) - %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %134 = extractelement <4 x float> %133, i32 0 - %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %136 = extractelement <4 x float> %135, i32 1 - %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %138 = extractelement <4 x float> %137, i32 2 - %139 = insertelement <4 x float> undef, float %99, i32 0 - %140 = insertelement <4 x float> %139, float %100, i32 1 - %141 = insertelement <4 x float> %140, float %101, i32 2 - %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 - %143 = insertelement <4 x float> undef, float %134, i32 0 - %144 = insertelement <4 x float> %143, float %136, i32 1 - %145 = insertelement <4 x float> %144, float %138, i32 2 - %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 - %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146) - %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %149 = extractelement <4 x float> %148, i32 0 - %150 = fmul float %149, %8 - %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %152 = extractelement <4 x float> %151, i32 1 - %153 = fmul float %152, %9 - %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %155 = extractelement <4 x float> %154, i32 2 - %156 = fmul float %155, %10 - %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %158 = extractelement <4 x float> %157, i32 0 - %159 = fmul float %158, %12 - %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %161 = extractelement <4 x float> %160, i32 1 - %162 = fmul float %161, %13 - %163 = load <4 x float>, <4 x float> 
addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %164 = extractelement <4 x float> %163, i32 2 - %165 = fmul float %164, %14 - %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %167 = extractelement <4 x float> %166, i32 0 - %168 = fmul float %167, %16 - %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %170 = extractelement <4 x float> %169, i32 1 - %171 = fmul float %170, %17 - %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %173 = extractelement <4 x float> %172, i32 2 - %174 = fmul float %173, %18 - %175 = fcmp uge float %132, 0.000000e+00 - %176 = select i1 %175, float %132, float 0.000000e+00 - %177 = fcmp uge float %147, 0.000000e+00 - %178 = select i1 %177, float %147, float 0.000000e+00 - %179 = call float @llvm.pow.f32(float %178, float %24) - %180 = fcmp ult float %132, 0.000000e+00 - %181 = select i1 %180, float 0.000000e+00, float %179 - %182 = fadd float %150, %105 - %183 = fadd float %153, %109 - %184 = fadd float %156, %113 - %185 = fmul float %176, %159 - %186 = fadd float %185, %182 - %187 = fmul float %176, %162 - %188 = fadd float %187, %183 - %189 = fmul float %176, %165 - %190 = fadd float %189, %184 - %191 = fmul float %181, %168 - %192 = fadd float %191, %186 - %193 = fmul float %181, %171 - %194 = fadd float %193, %188 - %195 = fmul float %181, %174 - %196 = fadd float %195, %190 - %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) - %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) - %200 = insertelement <4 x float> undef, float %75, i32 0 - %201 = insertelement <4 x float> %200, float %79, i32 1 - %202 = insertelement <4 x float> %201, float %83, i32 2 - %203 = insertelement <4 x float> %202, float %87, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1) - %204 = insertelement <4 x float> undef, float %197, i32 0 - %205 = insertelement <4 x float> %204, float %198, i32 1 - %206 = insertelement <4 x float> %205, float %199, i32 2 - %207 = insertelement <4 x float> %206, float %117, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll deleted file mode 100644 index 3a82ee30a32..00000000000 --- a/test/CodeGen/R600/r600-encoding.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck 
--check-prefix=R600 %s - -; The earliest R600 GPUs have a slightly different encoding than the rest of -; the VLIW4/5 GPUs. - -; EG: {{^}}test: -; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] - -; R600: {{^}}test: -; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] - -define void @test(<4 x float> inreg %reg0) #0 { -entry: - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fmul float %r0, %r1 - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll deleted file mode 100644 index 7cb80195b36..00000000000 --- a/test/CodeGen/R600/r600-export-fix.ll +++ /dev/null @@ -1,142 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s - -;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0XYZ -;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.YZ00 -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0000 - - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fmul float %5, %0 - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %8 = extractelement <4 x float> %7, i32 1 - %9 = fmul float %8, %0 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %11 = extractelement <4 x float> %10, i32 2 - %12 = fmul float %11, %0 - %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %14 = extractelement <4 x float> %13, i32 3 - %15 = fmul float %14, %0 - %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %17 = extractelement <4 x float> %16, i32 0 - %18 = fmul float %17, %1 - %19 = fadd float %18, %6 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %21 = extractelement <4 x float> %20, i32 1 - %22 = fmul float %21, %1 - %23 = fadd float %22, %9 - %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %25 = extractelement <4 x float> %24, i32 2 - %26 = fmul float %25, %1 - %27 = fadd float %26, %12 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fmul float %29, %1 - %31 = fadd float %30, %15 - %32 = load <4 x float>, 
<4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %33 = extractelement <4 x float> %32, i32 0 - %34 = fmul float %33, %2 - %35 = fadd float %34, %19 - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %37 = extractelement <4 x float> %36, i32 1 - %38 = fmul float %37, %2 - %39 = fadd float %38, %23 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %41 = extractelement <4 x float> %40, i32 2 - %42 = fmul float %41, %2 - %43 = fadd float %42, %27 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %45 = extractelement <4 x float> %44, i32 3 - %46 = fmul float %45, %2 - %47 = fadd float %46, %31 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %49 = extractelement <4 x float> %48, i32 0 - %50 = fmul float %49, %3 - %51 = fadd float %50, %35 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %53 = extractelement <4 x float> %52, i32 1 - %54 = fmul float %53, %3 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %57 = extractelement <4 x float> %56, i32 2 - %58 = fmul float %57, %3 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %61 = extractelement <4 x float> %60, i32 3 - %62 = fmul float %61, %3 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* null - %65 = extractelement <4 x float> %64, i32 0 - %66 = load <4 x float>, <4 x float> addrspace(8)* null - %67 = extractelement <4 x float> %66, i32 1 - %68 = load <4 x float>, <4 x float> addrspace(8)* null - %69 = extractelement <4 x float> %68, i32 2 - %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %71 = extractelement <4 x float> %70, i32 0 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %73 = extractelement <4 x float> %72, i32 1 - %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %75 = extractelement <4 x float> %74, i32 2 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 0 - %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %79 = extractelement <4 x float> %78, i32 1 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = insertelement <4 x float> undef, float %51, i32 0 - %83 = insertelement <4 x float> %82, float %55, i32 1 - %84 = insertelement <4 x float> %83, float %59, i32 2 - 
%85 = insertelement <4 x float> %84, float %63, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1) - %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1 - %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2 - %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2) - %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1 - %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2 - %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2) - %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %95 = insertelement <4 x float> %94, float %65, i32 1 - %96 = insertelement <4 x float> %95, float %67, i32 2 - %97 = insertelement <4 x float> %96, float %69, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2) - %98 = insertelement <4 x float> undef, float %77, i32 0 - %99 = insertelement <4 x float> %98, float %79, i32 1 - %100 = insertelement <4 x float> %99, float %81, i32 2 - %101 = insertelement <4 x float> %100, float %71, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2) - %102 = insertelement <4 x float> undef, float %73, i32 0 - %103 = insertelement <4 x float> %102, float %75, i32 1 - %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2 - %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2) - %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1 - %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2 - %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2) - %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1 - %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2 - %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll deleted file mode 100644 index f388f8ffe29..00000000000 --- a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll +++ /dev/null @@ -1,58 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -define void @main(<4 x float> inreg, <4 x float> inreg) #0 { -main_body: - %2 = extractelement <4 x float> %0, i32 0 - %3 = extractelement <4 x float> %0, i32 1 - %4 = extractelement <4 x float> %0, i32 2 - %5 = extractelement <4 x float> %0, i32 3 - %6 = insertelement <4 x float> undef, float %2, i32 0 - %7 = insertelement <4 x float> %6, float %3, i32 1 - %8 = insertelement <4 x float> %7, float %4, i32 2 - %9 = insertelement <4 x float> %8, float %5, i32 3 - %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9) - %11 = extractelement <4 x float> %10, i32 0 - %12 = extractelement <4 x float> %10, i32 1 - %13 = extractelement <4 x float> %10, i32 2 - %14 = extractelement <4 x 
float> %10, i32 3 - %15 = call float @fabs(float %13) - %16 = fdiv float 1.000000e+00, %15 - %17 = fmul float %11, %16 - %18 = fadd float %17, 1.500000e+00 - %19 = fmul float %12, %16 - %20 = fadd float %19, 1.500000e+00 - %21 = insertelement <4 x float> undef, float %20, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %14, i32 2 - %24 = insertelement <4 x float> %23, float %5, i32 3 - %25 = extractelement <4 x float> %24, i32 0 - %26 = extractelement <4 x float> %24, i32 1 - %27 = extractelement <4 x float> %24, i32 2 - %28 = extractelement <4 x float> %24, i32 3 - %29 = insertelement <4 x float> undef, float %25, i32 0 - %30 = insertelement <4 x float> %29, float %26, i32 1 - %31 = insertelement <4 x float> %30, float %27, i32 2 - %32 = insertelement <4 x float> %31, float %28, i32 3 - %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13) - %34 = extractelement <4 x float> %33, i32 0 - %35 = insertelement <4 x float> undef, float %34, i32 0 - %36 = insertelement <4 x float> %35, float %34, i32 1 - %37 = insertelement <4 x float> %36, float %34, i32 2 - %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll deleted file mode 100644 index c7b9d65220f..00000000000 --- a/test/CodeGen/R600/r600cfg.ll +++ /dev/null @@ -1,119 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = bitcast float %0 to i32 - %5 = icmp eq i32 %4, 0 - %6 = sext i1 %5 to i32 - %7 = bitcast i32 %6 to float - %8 = bitcast float %7 to i32 - %9 = icmp ne i32 %8, 0 - %. 
= select i1 %9, float 0x36A0000000000000, float %0 - br label %LOOP - -LOOP: ; preds = %LOOP47, %main_body - %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] - %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] - %10 = bitcast float %temp4.1 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF41, label %ENDIF40 - -IF41: ; preds = %LOOP - %16 = insertelement <4 x float> undef, float %0, i32 0 - %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 - %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 - %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) - %20 = insertelement <4 x float> undef, float %0, i32 0 - %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 - %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 - %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) - %24 = insertelement <4 x float> undef, float %0, i32 0 - %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 - %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 - %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) - %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 - %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 - %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) - %32 = insertelement <4 x float> undef, float %0, i32 0 - %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 - %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 - %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) - ret void - -ENDIF40: ; preds = %LOOP - %36 = bitcast float %temp8.0 to i32 - %37 = add i32 %36, 1 - %38 = bitcast i32 %37 to float - %39 = bitcast float %temp4.1 to i32 - %40 = urem i32 %39, 2 - %41 = bitcast i32 %40 to float - %42 = bitcast float %41 to i32 - %43 = icmp eq i32 %42, 0 - %44 = sext i1 %43 to i32 - %45 = bitcast i32 %44 to float - %46 = bitcast float %45 to i32 - %47 = icmp ne i32 %46, 0 - %48 = bitcast float %temp4.1 to i32 - br i1 %47, label %IF44, label %ELSE45 - -IF44: ; preds = %ENDIF40 - %49 = udiv i32 %48, 2 - br label %ENDIF43 - -ELSE45: ; preds = %ENDIF40 - %50 = mul i32 3, %48 - %51 = add i32 %50, 1 - br label %ENDIF43 - -ENDIF43: ; preds = %ELSE45, %IF44 - %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] - %52 = bitcast i32 %.sink to float - %53 = load <4 x float>, <4 x float> addrspace(8)* null - %54 = extractelement <4 x float> %53, i32 0 - %55 = bitcast float %54 to i32 - br label %LOOP47 - -LOOP47: ; preds = %ENDIF48, %ENDIF43 - %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] - %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] - %56 = bitcast float %temp28.0 to i32 - %57 = icmp uge i32 %56, %55 - %58 = sext i1 %57 to i32 - %59 = bitcast i32 %58 to float - %60 = bitcast float %59 to i32 - %61 = icmp ne i32 %60, 0 - br i1 %61, label %LOOP, label %ENDIF48 - -ENDIF48: ; preds 
= %LOOP47 - %62 = bitcast float %temp12.1 to i32 - %63 = mul i32 %62, 2 - %64 = bitcast i32 %63 to float - %65 = bitcast float %64 to i32 - %66 = urem i32 %65, 2147483647 - %67 = bitcast i32 %66 to float - %68 = bitcast float %temp28.0 to i32 - %69 = add i32 %68, 1 - %70 = bitcast i32 %69 to float - br label %LOOP47 -} - -declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll deleted file mode 100644 index b4ac47afced..00000000000 --- a/test/CodeGen/R600/reciprocal.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll deleted file mode 100644 index de6bfb31088..00000000000 --- a/test/CodeGen/R600/register-count-comments.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.SI.tid() nounwind readnone - -; SI-LABEL: {{^}}foo: -; SI: .section .AMDGPU.csdata -; SI: ; Kernel info: -; SI: ; NumSgprs: {{[0-9]+}} -; SI: ; NumVgprs: {{[0-9]+}} -define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { - %tid = call i32 @llvm.SI.tid() nounwind readnone - %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid - %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid - %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %outptr, align 4 - ret void -} - -; SI-LABEL: {{^}}one_vgpr_used: -; SI: NumVgprs: 1 -define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { - store i32 %x, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll deleted file mode 100644 index 187650ff9a5..00000000000 --- a/test/CodeGen/R600/reorder-stores.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16 - ret void 
-} - -; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI: ds_write_b64 -; SI: s_endpgm -define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 - ret void -} - -; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: s_endpgm -define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 - %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 - store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 - store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32 - ret void -} - -; SI-LABEL: {{^}}no_reorder_extload_64: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI-NOT: ds_read -; SI: ds_write_b64 -; SI: s_endpgm -define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 - %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 - %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> - %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64> - %tmp7 = add <2 x i64> %tmp1ext, - %tmp9 = add <2 x i64> %tmp4ext, - %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32> - %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32> - store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8 - store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8 - ret void -} diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll deleted file mode 100644 index 3f4ceb7e031..00000000000 --- a/test/CodeGen/R600/rotl.i64.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}s_rotl_i64: -; BOTH-DAG: s_lshl_b64 -; BOTH-DAG: s_sub_i32 -; BOTH-DAG: s_lshr_b64 -; BOTH: s_or_b64 -; BOTH: s_endpgm -define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { -entry: - %0 = shl i64 %x, %y - %1 = sub i64 64, %y - %2 = lshr i64 %x, %1 - %3 = or i64 %0, %2 - store i64 %3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotl_i64: -; 
SI-DAG: v_lshl_b64 -; VI-DAG: v_lshlrev_b64 -; BOTH-DAG: v_sub_i32 -; SI: v_lshr_b64 -; VI: v_lshrrev_b64 -; BOTH: v_or_b32 -; BOTH: v_or_b32 -; BOTH: s_endpgm -define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { -entry: - %x = load i64, i64 addrspace(1)* %xptr, align 8 - %y = load i64, i64 addrspace(1)* %yptr, align 8 - %tmp0 = shl i64 %x, %y - %tmp1 = sub i64 64, %y - %tmp2 = lshr i64 %x, %tmp1 - %tmp3 = or i64 %tmp0, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in, align 8 - ret void -} diff --git a/test/CodeGen/R600/rotl.ll b/test/CodeGen/R600/rotl.ll deleted file mode 100644 index 6c144cd56ea..00000000000 --- a/test/CodeGen/R600/rotl.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rotl_i32: -; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x -; R600-NEXT: 32 -; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}} - -; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} -; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] -; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] -define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { -entry: - %0 = shl i32 %x, %y - %1 = sub i32 32, %y - %2 = lshr i32 %x, %1 - %3 = or i32 %0, %2 - store i32 %3, i32 addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotl_v2i32: -; SI-DAG: s_sub_i32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI: s_endpgm -define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { -entry: - %0 = shl <2 x i32> %x, %y - %1 = sub <2 x i32> , %y - %2 = lshr <2 x i32> %x, %1 - %3 = or <2 x i32> %0, %2 - store <2 x i32> %3, <2 x i32> addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotl_v4i32: -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI: s_endpgm -define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { -entry: - %0 = shl <4 x i32> %x, %y - %1 = sub <4 x i32> , %y - %2 = lshr <4 x i32> %x, %1 - %3 = or <4 x i32> %0, %2 - store <4 x i32> %3, <4 x i32> addrspace(1)* %in - ret void -} diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll deleted file mode 100644 index 586de44a566..00000000000 --- a/test/CodeGen/R600/rotr.i64.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}s_rotr_i64: -; BOTH-DAG: s_sub_i32 -; BOTH-DAG: s_lshr_b64 -; BOTH-DAG: s_lshl_b64 -; BOTH: s_or_b64 -define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { -entry: - %tmp0 = sub i64 64, %y - %tmp1 = shl i64 %x, %tmp0 - %tmp2 = lshr i64 %x, %y - %tmp3 = or i64 %tmp1, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotr_i64: -; BOTH-DAG: v_sub_i32 -; SI-DAG: v_lshr_b64 -; SI-DAG: v_lshl_b64 -; VI-DAG: v_lshrrev_b64 -; VI-DAG: v_lshlrev_b64 -; BOTH: v_or_b32 -; BOTH: v_or_b32 -define void @v_rotr_i64(i64 addrspace(1)* %in, i64 
addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { -entry: - %x = load i64, i64 addrspace(1)* %xptr, align 8 - %y = load i64, i64 addrspace(1)* %yptr, align 8 - %tmp0 = sub i64 64, %y - %tmp1 = shl i64 %x, %tmp0 - %tmp2 = lshr i64 %x, %y - %tmp3 = or i64 %tmp1, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}s_rotr_v2i64: -define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { -entry: - %tmp0 = sub <2 x i64> , %y - %tmp1 = shl <2 x i64> %x, %tmp0 - %tmp2 = lshr <2 x i64> %x, %y - %tmp3 = or <2 x i64> %tmp1, %tmp2 - store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotr_v2i64: -define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { -entry: - %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 - %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 - %tmp0 = sub <2 x i64> , %y - %tmp1 = shl <2 x i64> %x, %tmp0 - %tmp2 = lshr <2 x i64> %x, %y - %tmp3 = or <2 x i64> %tmp1, %tmp2 - store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in - ret void -} diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll deleted file mode 100644 index 044f9ffe6d6..00000000000 --- a/test/CodeGen/R600/rotr.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rotr_i32: -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { -entry: - %tmp0 = sub i32 32, %y - %tmp1 = shl i32 %x, %tmp0 - %tmp2 = lshr i32 %x, %y - %tmp3 = or i32 %tmp1, %tmp2 - store i32 %tmp3, i32 addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotr_v2i32: -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { -entry: - %tmp0 = sub <2 x i32> , %y - %tmp1 = shl <2 x i32> %x, %tmp0 - %tmp2 = lshr <2 x i32> %x, %y - %tmp3 = or <2 x i32> %tmp1, %tmp2 - store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotr_v4i32: -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { -entry: - %tmp0 = sub <4 x i32> , %y - %tmp1 = shl <4 x i32> %x, %tmp0 - %tmp2 = lshr <4 x i32> %x, %y - %tmp3 = or <4 x i32> %tmp1, %tmp2 - store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in - ret void -} diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll deleted file mode 100644 index b67b800c737..00000000000 --- a/test/CodeGen/R600/rsq.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.sqrt.f32(float) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; SI-LABEL: 
{{^}}rsq_f32: -; SI: v_rsq_f32_e32 -; SI: s_endpgm -define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %val = load float, float addrspace(1)* %in, align 4 - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt - store float %div, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64_e32 -; SI: s_endpgm -define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { - %val = load double, double addrspace(1)* %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone - %div = fdiv double 1.0, %sqrt - store double %div, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f32_sgpr: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; SI: s_endpgm -define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt - store float %div, float addrspace(1)* %out, align 4 - ret void -} - -; Recognize that this is rsqrt(a) * rcp(b) * c, -; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. - -; SI-LABEL: @rsqrt_fmul -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 - -; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] -; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] -; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] -; SI-UNSAFE: buffer_store_dword [[RESULT]] - -; SI-SAFE-NOT: v_rsq_f32 - -; SI: s_endpgm -define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %x = call float @llvm.sqrt.f32(float %a) - %y = fmul float %x, %b - %z = fdiv float %c, %y - store float %z, float addrspace(1)* %out.gep - ret void -} diff --git a/test/CodeGen/R600/rv7x0_count3.ll b/test/CodeGen/R600/rv7x0_count3.ll deleted file mode 100644 index c3fd923e459..00000000000 --- a/test/CodeGen/R600/rv7x0_count3.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s - -; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] - -define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { - %1 = extractelement <4 x float> %reg1, i32 0 - %2 = extractelement <4 x float> %reg1, i32 1 - %3 = extractelement <4 x float> %reg1, i32 2 - %4 = extractelement <4 x float> %reg1, i32 3 - %5 = insertelement <4 x float> undef, float %1, i32 0 - %6 = insertelement <4 x float> %5, float %2, i32 1 - %7 = insertelement <4 x float> %6, float %3, i32 2 - %8 = insertelement <4 x float> %7, float %4, i32 3 - %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %10 = call 
<4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1) - %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1) - %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1) - %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1) - %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1) - %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1) - %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1) - %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1) - %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1) - %19 = fadd <4 x float> %9, %10 - %20 = fadd <4 x float> %19, %11 - %21 = fadd <4 x float> %20, %12 - %22 = fadd <4 x float> %21, %13 - %23 = fadd <4 x float> %22, %14 - %24 = fadd <4 x float> %23, %15 - %25 = fadd <4 x float> %24, %16 - %26 = fadd <4 x float> %25, %17 - %27 = fadd <4 x float> %26, %18 - call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2) - ret void -} - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/s_movk_i32.ll b/test/CodeGen/R600/s_movk_i32.ll deleted file mode 100644 index 6b1a36c979c..00000000000 --- a/test/CodeGen/R600/s_movk_i32.ll +++ /dev/null @@ -1,185 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_movk_i32_k0: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k1: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k2: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) - store i64 %or, i64 addrspace(1)* 
%out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k3: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k4: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k5: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k6: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k7: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} -; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - - -; SI-LABEL: {{^}}s_movk_i32_k8: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], 
v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k9: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k10: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k11: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k12: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 - store i64 %or, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll deleted file mode 100644 index f8ced7942a6..00000000000 --- a/test/CodeGen/R600/saddo.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s - -declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: 
{{^}}saddo_i64_zext: -define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_saddo_i32: -define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %sadd, 0 - %carry = extractvalue { i32, i1 } %sadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_saddo_i32: -define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %sadd, 0 - %carry = extractvalue { i32, i1 } %sadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_saddo_i64: -define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_saddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll deleted file mode 100644 index 0b964957654..00000000000 --- a/test/CodeGen/R600/salu-to-valu.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; In this test both the pointer and the offset operands to the -; BUFFER_LOAD instructions end up being stored in vgprs. This -; requires us to add the pointer and offset together, store the -; result in the offset operand (vaddr), and then store 0 in an -; sgpr register pair and use that for the pointer operand -; (low 64-bits of srsrc). 
- -; CHECK-LABEL: {{^}}mubuf: - -; Make sure we aren't using VGPRs for the source operand of s_mov_b64 -; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v - -; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_* -; instructions -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() #1 - %1 = call i32 @llvm.r600.read.tidig.y() #1 - %2 = sext i32 %0 to i64 - %3 = sext i32 %1 to i64 - br label %loop - -loop: - %4 = phi i64 [0, %entry], [%5, %loop] - %5 = add i64 %2, %4 - %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5 - %7 = load i8, i8 addrspace(1)* %6, align 1 - %8 = or i64 %5, 1 - %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8 - %10 = load i8, i8 addrspace(1)* %9, align 1 - %11 = add i8 %7, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - %13 = icmp slt i64 %5, 10 - br i1 %13, label %loop, label %done - -done: - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 -declare i32 @llvm.r600.read.tidig.y() #1 - -attributes #1 = { nounwind readnone } - -; Test moving an SMRD instruction to the VALU - -; CHECK-LABEL: {{^}}smrd_valu: -; CHECK: buffer_load_dword [[OUT:v[0-9]+]] -; CHECK: buffer_store_dword [[OUT]] - -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) { -entry: - %0 = icmp ne i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - br label %endif - -else: - %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2 - br label %endif - -endif: - %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else] - %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000 - %6 = load i32, i32 addrspace(2)* %5 - store i32 %6, i32 addrspace(1)* %out - ret void -} - -; Test moving ann SMRD with an immediate offset to the VALU - -; CHECK-LABEL: {{^}}smrd_valu2: -; CHECK: buffer_load_dword -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %1 = add i32 %0, 4 - %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4 - %3 = load i32, i32 addrspace(2)* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}s_load_imm_v8i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { -entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* - %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 - store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; CHECK-LABEL: {{^}}s_load_imm_v16i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { -entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* - %tmp3 = 
load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 - store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll deleted file mode 100644 index 0970e5d3063..00000000000 --- a/test/CodeGen/R600/scalar_to_vector.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm -define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tmp1 = load i32, i32 addrspace(1)* %in, align 4 - %bc = bitcast i32 %tmp1 to <2 x i16> - %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> - store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm -define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tmp1 = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %tmp1 to <2 x i16> - %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> - store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed -; to produce one, but for some reason never made it to selection.
- - -; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { -; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 -; %bc = bitcast i32 %tmp1 to <4 x i8> - -; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> -; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4 -; ret void -; } - -; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 -; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 -; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> -; %add = add <4 x i32> %bc, -; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> -; %add = add <8 x i16> %bc, -; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } diff --git a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll deleted file mode 100644 index 11e8f5176f4..00000000000 --- a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll +++ /dev/null @@ -1,82 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %1, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = fcmp ult float %0, 5.700000e+01 - %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 - %11 = fsub float -0.000000e+00, %10 - %12 = fptosi float %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %8 to i32 - %15 = bitcast float %13 to i32 - %16 = and i32 %14, %15 - %17 = bitcast i32 %16 to float - %18 = bitcast float %17 to i32 - %19 = icmp ne i32 %18, 0 - %20 = fcmp ult float %0, 0.000000e+00 - %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 - %22 = fsub float -0.000000e+00, %21 - %23 = fptosi float %22 to i32 - %24 = bitcast i32 %23 to float - %25 = bitcast float %24 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %19, label %IF, label %ELSE - -IF: ; preds = %main_body - %. 
= select i1 %26, float 0.000000e+00, float 1.000000e+00 - %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 - br label %ENDIF - -ELSE: ; preds = %main_body - br i1 %26, label %ENDIF, label %ELSE17 - -ENDIF: ; preds = %ELSE17, %ELSE, %IF - %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] - %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00) - %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00) - %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00) - %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %31 = insertelement <4 x float> undef, float %27, i32 0 - %32 = insertelement <4 x float> %31, float %28, i32 1 - %33 = insertelement <4 x float> %32, float %29, i32 2 - %34 = insertelement <4 x float> %33, float %30, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0) - ret void - -ELSE17: ; preds = %ELSE - %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %38 = fadd float %35, 0x3FC99999A0000000 - %39 = fadd float %36, 0x3FC99999A0000000 - %40 = fadd float %37, 0x3FC99999A0000000 - %41 = fadd float %38, 0x3FC99999A0000000 - %42 = fadd float %39, 0x3FC99999A0000000 - %43 = fadd float %40, 0x3FC99999A0000000 - %44 = fadd float %41, 0x3FC99999A0000000 - %45 = fadd float %42, 0x3FC99999A0000000 - %46 = fadd float %43, 0x3FC99999A0000000 - %47 = fadd float %44, 0x3FC99999A0000000 - %48 = fadd float %45, 0x3FC99999A0000000 - %49 = fadd float %46, 0x3FC99999A0000000 - br label %ENDIF -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } -attributes #1 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/schedule-fs-loop-nested.ll b/test/CodeGen/R600/schedule-fs-loop-nested.ll deleted file mode 100644 index 759197ca61f..00000000000 --- a/test/CodeGen/R600/schedule-fs-loop-nested.ll +++ /dev/null @@ -1,88 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = bitcast float %3 to i32 - %5 = sdiv i32 %4, 4 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = mul i32 %7, 4 - %9 = bitcast i32 %8 to float - %10 = bitcast float %9 to i32 - %11 = sub i32 0, %10 - %12 = bitcast i32 %11 to float - %13 = bitcast float %3 to i32 - %14 = bitcast float %12 to i32 - %15 = add i32 %13, %14 - %16 = bitcast i32 %15 to float - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = load <4 x float>, <4 x float> addrspace(9)* null - %20 = extractelement <4 x float> %19, i32 1 - %21 = load <4 x float>, <4 x float> addrspace(9)* null - %22 = extractelement <4 x float> %21, i32 2 - br label %LOOP - -LOOP: ; preds = %IF31, %main_body - %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] - %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] - %temp5.0 = phi float [ %20, 
%main_body ], [ %temp5.1, %IF31 ] - %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] - %23 = bitcast float %temp12.0 to i32 - %24 = bitcast float %6 to i32 - %25 = icmp sge i32 %23, %24 - %26 = sext i1 %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - %29 = icmp ne i32 %28, 0 - br i1 %29, label %IF, label %LOOP29 - -IF: ; preds = %LOOP - %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) - %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %34 = insertelement <4 x float> undef, float %30, i32 0 - %35 = insertelement <4 x float> %34, float %31, i32 1 - %36 = insertelement <4 x float> %35, float %32, i32 2 - %37 = insertelement <4 x float> %36, float %33, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0) - ret void - -LOOP29: ; preds = %LOOP, %ENDIF30 - %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] - %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] - %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] - %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] - %38 = bitcast float %temp20.0 to i32 - %39 = bitcast float %16 to i32 - %40 = icmp sge i32 %38, %39 - %41 = sext i1 %40 to i32 - %42 = bitcast i32 %41 to float - %43 = bitcast float %42 to i32 - %44 = icmp ne i32 %43, 0 - br i1 %44, label %IF31, label %ENDIF30 - -IF31: ; preds = %LOOP29 - %45 = bitcast float %temp12.0 to i32 - %46 = add i32 %45, 1 - %47 = bitcast i32 %46 to float - br label %LOOP - -ENDIF30: ; preds = %LOOP29 - %48 = bitcast float %temp20.0 to i32 - %49 = add i32 %48, 1 - %50 = bitcast i32 %49 to float - br label %LOOP29 -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/schedule-fs-loop.ll b/test/CodeGen/R600/schedule-fs-loop.ll deleted file mode 100644 index 28cc08abc02..00000000000 --- a/test/CodeGen/R600/schedule-fs-loop.ll +++ /dev/null @@ -1,55 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = load <4 x float>, <4 x float> addrspace(9)* null - %5 = extractelement <4 x float> %4, i32 0 - %6 = load <4 x float>, <4 x float> addrspace(9)* null - %7 = extractelement <4 x float> %6, i32 1 - %8 = load <4 x float>, <4 x float> addrspace(9)* null - %9 = extractelement <4 x float> %8, i32 2 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] - %10 = bitcast float %temp8.0 to i32 - %11 = bitcast float %3 to i32 - %12 = icmp sge i32 %10, %11 - %13 = sext i1 %12 to i32 - %14 = bitcast i32 %13 to float - %15 = bitcast float %14 to i32 - %16 = icmp ne i32 %15, 0 - br i1 %16, label %IF, label %ENDIF - -IF: ; preds = %LOOP - %17 = call float @llvm.AMDIL.clamp.(float 
%temp4.0, float 0.000000e+00, float 1.000000e+00) - %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %21 = insertelement <4 x float> undef, float %17, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %19, i32 2 - %24 = insertelement <4 x float> %23, float %20, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0) - ret void - -ENDIF: ; preds = %LOOP - %25 = bitcast float %temp8.0 to i32 - %26 = add i32 %25, 1 - %27 = bitcast i32 %26 to float - br label %LOOP -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll deleted file mode 100644 index 3f728fd873b..00000000000 --- a/test/CodeGen/R600/schedule-global-loads.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FIXME: This currently doesn't do a great job of clustering the -; loads, which end up with extra moves between them. Right now, it -; seems the only thing areLoadsFromSameBasePtr is accomplishing is -; ordering the loads so that the lower address loads come first. - -; FUNC-LABEL: {{^}}cluster_global_arg_loads: -; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 -; SI: buffer_store_dword [[REG0]] -; SI: buffer_store_dword [[REG1]] -define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { - %load0 = load i32, i32 addrspace(1)* %ptr, align 4 - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1 - %load1 = load i32, i32 addrspace(1)* %gep, align 4 - store i32 %load0, i32 addrspace(1)* %out0, align 4 - store i32 %load1, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking -; an MUBUF load which does not have a vaddr operand.
-; FUNC-LABEL: {{^}}same_base_ptr_crash: -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { -entry: - %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset - %tmp0 = load i32, i32 addrspace(1)* %out - %tmp1 = load i32, i32 addrspace(1)* %out1 - %tmp2 = add i32 %tmp0, %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/schedule-if-2.ll b/test/CodeGen/R600/schedule-if-2.ll deleted file mode 100644 index 54946509683..00000000000 --- a/test/CodeGen/R600/schedule-if-2.ll +++ /dev/null @@ -1,94 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %1 = extractelement <4 x float> %0, i32 0 - %2 = fadd float 1.000000e+03, %1 - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %4 = extractelement <4 x float> %3, i32 0 - %5 = bitcast float %4 to i32 - %6 = icmp eq i32 %5, 0 - %7 = sext i1 %6 to i32 - %8 = bitcast i32 %7 to float - %9 = bitcast float %8 to i32 - %10 = icmp ne i32 %9, 0 - br i1 %10, label %IF, label %ELSE - -IF: ; preds = %main_body - %11 = call float @fabs(float %2) - %12 = fcmp ueq float %11, 0x7FF0000000000000 - %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00 - %14 = fsub float -0.000000e+00, %13 - %15 = fptosi float %14 to i32 - %16 = bitcast i32 %15 to float - %17 = bitcast float %16 to i32 - %18 = icmp ne i32 %17, 0 - %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00 - %19 = fcmp une float %2, %2 - %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00 - %21 = fsub float -0.000000e+00, %20 - %22 = fptosi float %21 to i32 - %23 = bitcast i32 %22 to float - %24 = bitcast float %23 to i32 - %25 = icmp ne i32 %24, 0 - %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00 - %26 = bitcast float %. 
to i32 - %27 = sitofp i32 %26 to float - %28 = bitcast float %temp8.0 to i32 - %29 = sitofp i32 %28 to float - %30 = fcmp ugt float %2, 0.000000e+00 - %31 = select i1 %30, float 1.000000e+00, float %2 - %32 = fcmp uge float %31, 0.000000e+00 - %33 = select i1 %32, float %31, float -1.000000e+00 - %34 = fadd float %33, 1.000000e+00 - %35 = fmul float %34, 5.000000e-01 - br label %ENDIF - -ELSE: ; preds = %main_body - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %37 = extractelement <4 x float> %36, i32 0 - %38 = bitcast float %37 to i32 - %39 = icmp eq i32 %38, 1 - %40 = sext i1 %39 to i32 - %41 = bitcast i32 %40 to float - %42 = bitcast float %41 to i32 - %43 = icmp ne i32 %42, 0 - br i1 %43, label %IF23, label %ENDIF - -ENDIF: ; preds = %IF23, %ELSE, %IF - %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ] - %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ] - %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] - %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] - %44 = insertelement <4 x float> undef, float %temp4.0, i32 0 - %45 = insertelement <4 x float> %44, float %temp5.0, i32 1 - %46 = insertelement <4 x float> %45, float %temp6.0, i32 2 - %47 = insertelement <4 x float> %46, float %temp7.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0) - ret void - -IF23: ; preds = %ELSE - %48 = fcmp ult float 0.000000e+00, %2 - %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00 - %50 = fsub float -0.000000e+00, %49 - %51 = fptosi float %50 to i32 - %52 = bitcast i32 %51 to float - %53 = bitcast float %52 to i32 - %54 = icmp ne i32 %53, 0 - %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00 - %55 = bitcast float %.28 to i32 - %56 = sitofp i32 %55 to float - %57 = load <4 x float>, <4 x float> addrspace(8)* null - %58 = extractelement <4 x float> %57, i32 0 - %59 = fsub float -0.000000e+00, %58 - %60 = fadd float %2, %59 - br label %ENDIF -} - -declare float @fabs(float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readonly } diff --git a/test/CodeGen/R600/schedule-if.ll b/test/CodeGen/R600/schedule-if.ll deleted file mode 100644 index 94c653c8f25..00000000000 --- a/test/CodeGen/R600/schedule-if.ll +++ /dev/null @@ -1,46 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - %4 = sext i1 %3 to i32 - %5 = bitcast i32 %4 to float - %6 = bitcast float %5 to i32 - %7 = icmp ne i32 %6, 0 - br i1 %7, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %9 = extractelement <4 x float> %8, i32 0 - %10 = bitcast float %9 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF13, label %ENDIF - -ENDIF: ; preds = %IF13, %ELSE, %main_body - %temp.0 = phi float [ 1.000000e+03, %main_body ], [ 
1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %16 = insertelement <4 x float> undef, float %temp.0, i32 0 - %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 - %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2 - %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) - ret void - -IF13: ; preds = %ELSE - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fsub float -0.000000e+00, %21 - %23 = fadd float 1.000000e+03, %22 - br label %ENDIF -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll deleted file mode 100644 index 6b3e0814c38..00000000000 --- a/test/CodeGen/R600/schedule-kernel-arg-loads.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s - -; FUNC-LABEL: {{^}}cluster_arg_loads: -; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe -; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { - store i32 %x, i32 addrspace(1)* %out0, align 4 - store i32 %y, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when -; s_load_dwordx2 has a register offset - -; FUNC-LABEL: @same_base_ptr_crash -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_endpgm -define void @same_base_ptr_crash(i64 addrspace(1)* %out, - i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, - i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, - i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, - i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, - i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, - i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, - i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55, - i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63, - i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71, - i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79, - i64 %arg80, i64 
%arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87, - i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95, - i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103, - i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111, - i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, - i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { -entry: - %value = add i64 %arg125, %arg126 - store i64 %value, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll deleted file mode 100644 index 3863afda5dd..00000000000 --- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll +++ /dev/null @@ -1,163 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI - -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate - - -; SI-LABEL: {{^}}main( -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 2 - %2 = fcmp ult float %0, 0.000000e+00 - %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00 - %4 = fsub float -0.000000e+00, %3 - %5 = fptosi float %4 to i32 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = icmp ne i32 %7, 0 - br i1 %8, label %LOOP, label %ENDIF - -Flow1: ; preds = %ENDIF19, %ENDIF16 - %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ] - %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ] - %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ] - %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ] - %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ] - %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ] - br label %Flow - -Flow2: ; preds = %Flow - br label %ENDIF - -ENDIF: ; preds = %main_body, %Flow2 - %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ] - %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ] - %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ] - %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] - %15 = extractelement <4 x float> %reg1, i32 1 - %16 = extractelement <4 x float> %reg1, i32 3 - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null - %21 = extractelement <4 x float> %20, i32 1 - %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* null - %24 = extractelement <4 x float> %23, i32 2 - %25 = fmul float %24, %0 - %26 = load <4 x float>, <4 x float> addrspace(9)* null - %27 = extractelement <4 x float> %26, i32 3 - %28 = fmul float %27, %0 - %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %30 = extractelement <4 x float> %29, i32 0 - %31 = fmul float %30, %15 - %32 = fadd float %31, %19 - %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %34 = 
extractelement <4 x float> %33, i32 1 - %35 = fmul float %34, %15 - %36 = fadd float %35, %22 - %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %38 = extractelement <4 x float> %37, i32 2 - %39 = fmul float %38, %15 - %40 = fadd float %39, %25 - %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %42 = extractelement <4 x float> %41, i32 3 - %43 = fmul float %42, %15 - %44 = fadd float %43, %28 - %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %46 = extractelement <4 x float> %45, i32 0 - %47 = fmul float %46, %1 - %48 = fadd float %47, %32 - %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %50 = extractelement <4 x float> %49, i32 1 - %51 = fmul float %50, %1 - %52 = fadd float %51, %36 - %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %54 = extractelement <4 x float> %53, i32 2 - %55 = fmul float %54, %1 - %56 = fadd float %55, %40 - %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %58 = extractelement <4 x float> %57, i32 3 - %59 = fmul float %58, %1 - %60 = fadd float %59, %44 - %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %62 = extractelement <4 x float> %61, i32 0 - %63 = fmul float %62, %16 - %64 = fadd float %63, %48 - %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %66 = extractelement <4 x float> %65, i32 1 - %67 = fmul float %66, %16 - %68 = fadd float %67, %52 - %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %70 = extractelement <4 x float> %69, i32 2 - %71 = fmul float %70, %16 - %72 = fadd float %71, %56 - %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %74 = extractelement <4 x float> %73, i32 3 - %75 = fmul float %74, %16 - %76 = fadd float %75, %60 - %77 = insertelement <4 x float> undef, float %64, i32 0 - %78 = insertelement <4 x float> %77, float %68, i32 1 - %79 = insertelement <4 x float> %78, float %72, i32 2 - %80 = insertelement <4 x float> %79, float %76, i32 3 - call void @llvm.AMDGPU.barrier.local() - %81 = insertelement <4 x float> undef, float %temp.0, i32 0 - %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 - %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 - %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 - call void @llvm.AMDGPU.barrier.local() - ret void - -LOOP: ; preds = %main_body, %Flow - %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ] - %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ] - %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ] - %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ] - %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ] - %85 = fcmp uge float %temp4.0, %0 - %86 = select i1 %85, 
float 1.000000e+00, float 0.000000e+00 - %87 = fsub float -0.000000e+00, %86 - %88 = fptosi float %87 to i32 - %89 = bitcast i32 %88 to float - %90 = bitcast float %89 to i32 - %91 = icmp ne i32 %90, 0 - %92 = xor i1 %91, true - br i1 %92, label %ENDIF16, label %Flow - -ENDIF16: ; preds = %LOOP - %93 = fcmp une float %1, %temp4.0 - %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00 - %95 = fsub float -0.000000e+00, %94 - %96 = fptosi float %95 to i32 - %97 = bitcast i32 %96 to float - %98 = bitcast float %97 to i32 - %99 = icmp ne i32 %98, 0 - %100 = xor i1 %99, true - br i1 %100, label %ENDIF19, label %Flow1 - -Flow: ; preds = %Flow1, %LOOP - %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ] - %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ] - %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ] - %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ] - %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ] - %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ] - %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ] - %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ] - %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ] - %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ] - br i1 %110, label %Flow2, label %LOOP - -ENDIF19: ; preds = %ENDIF16 - %111 = fadd float %temp.1, 1.000000e+00 - %112 = fadd float %temp1.1, 0.000000e+00 - %113 = fadd float %temp2.1, 0.000000e+00 - %114 = fadd float %temp3.1, 0.000000e+00 - %115 = fadd float %temp4.0, 1.000000e+00 - br label %Flow1 -} - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll deleted file mode 100644 index 8d980dbf899..00000000000 --- a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -;REQUIRES: asserts - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %0, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = bitcast float %8 to i32 - %10 = icmp ne i32 %9, 0 - br i1 %10, label %LOOP, label %ENDIF - -ENDIF: ; preds = %ENDIF16, %LOOP, %main_body - %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ] - %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ] - %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ] - %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ] - %11 = load <4 x float>, <4 x float> addrspace(9)* null - %12 = extractelement <4 x float> %11, i32 0 - %13 = fmul float %12, %0 - %14 = load <4 x float>, <4 x float> addrspace(9)* null - %15 = extractelement <4 x float> %14, i32 1 - %16 = fmul float %15, %0 - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 2 - %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null - %21 = extractelement <4 x float> %20, i32 3 - %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, 
i64 0, i32 1) - %24 = extractelement <4 x float> %23, i32 0 - %25 = fmul float %24, %1 - %26 = fadd float %25, %13 - %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %28 = extractelement <4 x float> %27, i32 1 - %29 = fmul float %28, %1 - %30 = fadd float %29, %16 - %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %32 = extractelement <4 x float> %31, i32 2 - %33 = fmul float %32, %1 - %34 = fadd float %33, %19 - %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %36 = extractelement <4 x float> %35, i32 3 - %37 = fmul float %36, %1 - %38 = fadd float %37, %22 - %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %40 = extractelement <4 x float> %39, i32 0 - %41 = fmul float %40, %2 - %42 = fadd float %41, %26 - %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %44 = extractelement <4 x float> %43, i32 1 - %45 = fmul float %44, %2 - %46 = fadd float %45, %30 - %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %48 = extractelement <4 x float> %47, i32 2 - %49 = fmul float %48, %2 - %50 = fadd float %49, %34 - %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %52 = extractelement <4 x float> %51, i32 3 - %53 = fmul float %52, %2 - %54 = fadd float %53, %38 - %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %56 = extractelement <4 x float> %55, i32 0 - %57 = fmul float %56, %3 - %58 = fadd float %57, %42 - %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %60 = extractelement <4 x float> %59, i32 1 - %61 = fmul float %60, %3 - %62 = fadd float %61, %46 - %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %64 = extractelement <4 x float> %63, i32 2 - %65 = fmul float %64, %3 - %66 = fadd float %65, %50 - %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %68 = extractelement <4 x float> %67, i32 3 - %69 = fmul float %68, %3 - %70 = fadd float %69, %54 - %71 = insertelement <4 x float> undef, float %58, i32 0 - %72 = insertelement <4 x float> %71, float %62, i32 1 - %73 = insertelement <4 x float> %72, float %66, i32 2 - %74 = insertelement <4 x float> %73, float %70, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1) - %75 = insertelement <4 x float> undef, float %temp.0, i32 0 - %76 = insertelement <4 x float> %75, float %temp1.0, i32 1 - %77 = insertelement <4 x float> %76, float %temp2.0, i32 2 - %78 = insertelement <4 x float> %77, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2) - ret void - -LOOP: ; preds = %main_body, %ENDIF19 - %temp.1 = phi float [ %93, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp1.1 = 
phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ] - %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ] - %79 = fcmp uge float %temp4.0, %0 - %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00 - %81 = fsub float -0.000000e+00, %80 - %82 = fptosi float %81 to i32 - %83 = bitcast i32 %82 to float - %84 = bitcast float %83 to i32 - %85 = icmp ne i32 %84, 0 - br i1 %85, label %ENDIF, label %ENDIF16 - -ENDIF16: ; preds = %LOOP - %86 = fcmp une float %2, %temp4.0 - %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00 - %88 = fsub float -0.000000e+00, %87 - %89 = fptosi float %88 to i32 - %90 = bitcast i32 %89 to float - %91 = bitcast float %90 to i32 - %92 = icmp ne i32 %91, 0 - br i1 %92, label %ENDIF, label %ENDIF19 - -ENDIF19: ; preds = %ENDIF16 - %93 = fadd float %temp.1, 1.000000e+00 - %94 = fadd float %temp1.1, 0.000000e+00 - %95 = fadd float %temp2.1, 0.000000e+00 - %96 = fadd float %temp3.1, 0.000000e+00 - %97 = fadd float %temp4.0, 1.000000e+00 - br label %LOOP -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/scratch-buffer.ll b/test/CodeGen/R600/scratch-buffer.ll deleted file mode 100644 index 56088718ada..00000000000 --- a/test/CodeGen/R600/scratch-buffer.ll +++ /dev/null @@ -1,87 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; When a frame index offset is more than 12-bits, make sure we don't store -; it in mubuf's offset field. - -; Also, make sure we use the same register for storing the scratch buffer address -; for both stores. This register is allocated by the register scavenger, so we -; should be able to reuse the same register for each scratch buffer access.
- -; CHECK-LABEL: {{^}}legal_offset_fi: -; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} - -define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { -entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] - - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0 - store i32 1, i32* %scratchptr0 - - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0 - store i32 2, i32* %scratchptr1 - - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else - -if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr - br label %done - -else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr - br label %done - -done: - %value = phi i32 [%if_value, %if], [%else_value, %else] - store i32 %value, i32 addrspace(1)* %out - ret void - - ret void - -} - -; CHECK-LABEL: {{^}}legal_offset_fi_offset -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} - -define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { -entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] - - %offset0 = load i32, i32 addrspace(1)* %offsets - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0 - store i32 %offset0, i32* %scratchptr0 - - %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1 - %offset1 = load i32, i32 addrspace(1)* %offsetptr1 - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1 - store i32 %offset1, i32* %scratchptr1 - - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else - -if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr - br label %done - -else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr - br label %done - -done: - %value = phi i32 [%if_value, %if], [%else_value, %else] - store i32 %value, i32 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll deleted file mode 100644 index de645353a40..00000000000 --- a/test/CodeGen/R600/sdiv.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; The code generated by sdiv is long and complex and may frequently change. -; The goal of this test is to make sure the ISel doesn't fail. 
-; -; This program was previously failing to compile when one of the selectcc -; opcodes generated by the sdiv lowering was being legalized and optimized to: -; selectcc Remainder -1, 0, -1, SETGT -; This was fixed by adding an additional pattern in R600Instructions.td to -; match this pattern with a CNDGE_INT. - -; FUNC-LABEL: {{^}}sdiv_i32: -; EG: CF_END -define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in - %den = load i32, i32 addrspace(1) * %den_ptr - %result = sdiv i32 %num, %den - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sdiv_i32_4: -define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = sdiv i32 %num, 4 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; Multiply by a weird constant to make sure setIntDivIsCheap is -; working. - -; FUNC-LABEL: {{^}}slow_sdiv_i32_3435: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b -; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]] -; SI: v_add_i32 -; SI: v_lshrrev_b32 -; SI: v_ashrrev_i32 -; SI: v_add_i32 -; SI: buffer_store_dword -; SI: s_endpgm -define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = sdiv i32 %num, 3435 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr - %result = sdiv <2 x i32> %num, %den - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %result = sdiv <2 x i32> %num, <i32 4, i32 4> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr - %result = sdiv <4 x i32> %num, %den - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; Tests for 64-bit divide bypass.
-; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %result = sdiv i64 %a, %b -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %result = srem i64 %a, %b -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %resultdiv = sdiv i64 %a, %b -; %resultrem = srem i64 %a, %b -; %result = add i64 %resultdiv, %resultrem -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } diff --git a/test/CodeGen/R600/sdivrem24.ll b/test/CodeGen/R600/sdivrem24.ll deleted file mode 100644 index ad5df39f550..00000000000 --- a/test/CodeGen/R600/sdivrem24.ll +++ /dev/null @@ -1,239 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}sdiv24_i8: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = sdiv i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sdiv24_i16: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = sdiv i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}sdiv24_i32: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sdiv25_i32: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; 
EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}srem24_i8: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = srem i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srem24_i16: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = srem i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}srem24_i32: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}srem25_i32: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_srem24_i32_1: -; SI-NOT: v_cvt_f32_i32 
-; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_srem24_i32_2: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/sdivrem64.ll b/test/CodeGen/R600/sdivrem64.ll deleted file mode 100644 index a9b2b7f9df5..00000000000 --- a/test/CodeGen/R600/sdivrem64.ll +++ /dev/null @@ -1,225 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test_sdiv: -;EG: RECIP_UINT -;EG: LSHL {{.*}}, 1, -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT - -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = sdiv i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem: -;EG: RECIP_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: AND_INT {{.*}}, 1, - -;GCN: 
s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = urem i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_sdiv3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 33 - %2 = ashr i64 %y, 33 - %result = sdiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 33 - %2 = ashr i64 %y, 33 - %result = srem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_sdiv2464: -;EG: INT_TO_FLT -;EG: INT_TO_FLT -;EG: FLT_TO_INT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 40 - %2 = ashr i64 %y, 40 - %result = sdiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem2464: -;EG: INT_TO_FLT -;EG: INT_TO_FLT -;EG: FLT_TO_INT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 40 - %2 = ashr i64 %y, 40 - %result = srem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/select-i1.ll b/test/CodeGen/R600/select-i1.ll deleted file mode 100644 index 6735394e93a..00000000000 --- a/test/CodeGen/R600/select-i1.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI - -; FUNC-LABEL: {{^}}select_i1: -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 -define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i1 %a, i1 %b - store i1 %sel, i1 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/select-vectors.ll b/test/CodeGen/R600/select-vectors.ll deleted file mode 100644 index 59082c65cc8..00000000000 --- a/test/CodeGen/R600/select-vectors.ll +++ /dev/null @@ -1,156 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs 
-march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; Test expansion of scalar selects on vectors. -; Evergreen not enabled since it seems to be having problems with doubles. - - -; FUNC-LABEL: {{^}}select_v4i8: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { - %cmp = icmp eq i8 %c, 0 - %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b - store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}select_v4i16: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b - store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}select_v2i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: buffer_store_dwordx2 -define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b - store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}select_v4i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: buffer_store_dwordx4 -define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b - store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b - store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v2f32: -; SI: buffer_store_dwordx2 -define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x float> %a, <2 x float> %b - store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v4f32: -; SI: buffer_store_dwordx4 -define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x float> %a, <4 x float> %b - store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8f32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x float> %a, <8 x float> %b - store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v2f64: -; 
SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x double> %a, <2 x double> %b - store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v4f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x double> %a, <4 x double> %b - store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x double> %a, <8 x double> %b - store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll deleted file mode 100644 index 45f3cd5a7ac..00000000000 --- a/test/CodeGen/R600/select.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; Normally icmp + select is optimized to select_cc, when this happens the -; DAGLegalizer never sees the select and doesn't have a chance to leaglize it. -; -; In order to avoid the select_cc optimization, this test case calculates the -; condition for the select in a separate basic block. 
- -; FUNC-LABEL: {{^}}select: -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, - <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, - <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, - i32 %cond) { -entry: - br label %for -body: - %inc = add i32 %i, 1 - %br_cmp.i = icmp eq i1 %br_cmp, 0 - br label %for -for: - %i = phi i32 [ %inc, %body], [ 0, %entry ] - %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] - %0 = icmp eq i32 %cond, %i - %1 = select i1 %br_cmp, i32 2, i32 3 - %2 = select i1 %br_cmp, float 2.0 , float 5.0 - %3 = select i1 %br_cmp, <2 x i32> , <2 x i32> - %4 = select i1 %br_cmp, <2 x float> , <2 x float> - %5 = select i1 %br_cmp, <4 x i32> , <4 x i32> - %6 = select i1 %br_cmp, <4 x float> , <4 x float> - br i1 %0, label %body, label %done - -done: - store i32 %1, i32 addrspace(1)* %i32out - store float %2, float addrspace(1)* %f32out - store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out - store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out - store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out - store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out - ret void -} diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll deleted file mode 100644 index 5cebb30dc72..00000000000 --- a/test/CodeGen/R600/select64.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}select0: -; i64 select should be split into two i32 selects, and we shouldn't need -; to use a shfit to extract the hi dword of the input. 
-; CHECK-NOT: s_lshr_b64 -; CHECK: v_cndmask -; CHECK: v_cndmask -define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { -entry: - %0 = icmp ugt i32 %cond, 5 - %1 = select i1 %0, i64 0, i64 %in - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}select_trunc_i64: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i64 0, i64 %in - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}select_trunc_i64_2: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}v_select_trunc_i64_2: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %sel = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}v_select_i64_split_imm: -; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 -; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 -; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] -; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} -; CHECK: s_endpgm -define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32 - store i64 %sel, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/selectcc-cnd.ll b/test/CodeGen/R600/selectcc-cnd.ll deleted file mode 100644 index 94d0ace7569..00000000000 --- a/test/CodeGen/R600/selectcc-cnd.ll +++ /dev/null @@ -1,12 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: SETE -;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, -;CHECK: 1073741824 -define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { - %1 = load float, float addrspace(1)* %in - %2 = fcmp oeq float %1, 0.0 - %3 = select i1 %2, float 1.0, float 2.0 - store float %3, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc-cnde-int.ll b/test/CodeGen/R600/selectcc-cnde-int.ll deleted file mode 100644 index 58a4ee7d62b..00000000000 --- a/test/CodeGen/R600/selectcc-cnde-int.ll +++ /dev/null @@ -1,12 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: SETE_INT -;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, -;CHECK-NEXT: 2 -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %1 = load i32, i32 addrspace(1)* %in - %2 = icmp eq i32 %1, 0 - %3 = select i1 %2, i32 1, i32 2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll 
b/test/CodeGen/R600/selectcc-icmp-select-float.ll deleted file mode 100644 index e870ee891e6..00000000000 --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Note additional optimizations may cause this SGT to be replaced with a -; CND* instruction. -; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x, -; CHECK-NEXT: -1 -; Test a selectcc with i32 LHS/RHS and float True/False - -define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = icmp sge i32 %0, 0 - %2 = select i1 %1, float 1.0, float 0.0 - store float %2, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll deleted file mode 100644 index 65be4a626a1..00000000000 --- a/test/CodeGen/R600/selectcc-opt.ll +++ /dev/null @@ -1,80 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}test_a: -; EG-NOT: CND -; EG: SET{{[NEQGTL]+}}_DX10 - -define void @test_a(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 0.000000e+00 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - %4 = bitcast i32 %3 to float - %5 = bitcast float %4 to i32 - %6 = icmp ne i32 %5, 0 - br i1 %6, label %IF, label %ENDIF - -IF: - %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %7 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} - -; Same as test_a, but the branch labels are swapped to produce the inverse cc -; for the icmp instruction - -; EG-LABEL: {{^}}test_b: -; EG: SET{{[GTEQN]+}}_DX10 -; EG-NEXT: PRED_ -; EG-NEXT: ALU clause starting -define void @test_b(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 0.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - %4 = bitcast i32 %3 to float - %5 = bitcast float %4 to i32 - %6 = icmp ne i32 %5, 0 - br i1 %6, label %ENDIF, label %IF - -IF: - %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %7 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} - -; Test a CND*_INT instruction with float true/false values -; EG-LABEL: {{^}}test_c: -; EG: CND{{[GTE]+}}_INT -define void @test_c(float addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - %1 = select i1 %0, float 2.0, float 3.0 - store float %1, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}selectcc_bool: -; SI: v_cmp_ne_i32 -; SI-NEXT: v_cndmask_b32_e64 -; SI-NOT: cmp -; SI-NOT: cndmask -define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = select i1 %icmp0, i32 -1, i32 0 - store i32 %ext, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc.ll b/test/CodeGen/R600/selectcc.ll deleted file mode 100644 index f378e15dd76..00000000000 --- a/test/CodeGen/R600/selectcc.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc 
-verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}selectcc_i64: -; EG: XOR_INT -; EG: XOR_INT -; EG: OR_INT -; EG: CNDE_INT -; EG: CNDE_INT -; SI: v_cmp_eq_i64 -; SI: v_cndmask -; SI: v_cndmask -define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { -entry: - %0 = icmp eq i64 %lhs, %rhs - %1 = select i1 %0, i64 %true, i64 %false - store i64 %1, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll deleted file mode 100644 index 53694dcffa6..00000000000 --- a/test/CodeGen/R600/set-dx10.ll +++ /dev/null @@ -1,161 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; These tests check that floating point comparisons which are used by select -; to store integer true (-1) and false (0) values are lowered to one of the -; SET*DX10 instructions. - -; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp une float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp une float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ogt float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ogt float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - 
-; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oge float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oge float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/setcc-equivalent.ll b/test/CodeGen/R600/setcc-equivalent.ll deleted file mode 100644 index 11ea793650c..00000000000 --- a/test/CodeGen/R600/setcc-equivalent.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -; EG-LABEL: {{^}}and_setcc_setcc_i32: -; EG: AND_INT -; EG-NEXT: SETE_INT -define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %cmp1 = icmp eq i32 %a, -1 - %cmp2 = icmp eq i32 %b, -1 - %and = and i1 %cmp1, %cmp2 - %ext = sext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; EG-LABEL: {{^}}and_setcc_setcc_v4i32: -; EG: AND_INT -; EG: AND_INT -; EG: SETE_INT -; EG: AND_INT -; EG: SETE_INT -; EG: AND_INT -; EG: SETE_INT -define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { - %cmp1 = icmp eq <4 x i32> %a, - %cmp2 = icmp eq <4 x i32> %b, - %and = and <4 x i1> %cmp1, %cmp2 - %ext = sext <4 x i1> %and to <4 x i32> - store <4 x i32> %ext, <4 x i32> 
addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll deleted file mode 100644 index 4e6a10d6b78..00000000000 --- a/test/CodeGen/R600/setcc-opt.ll +++ /dev/null @@ -1,236 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT:buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm - -; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W -; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm - -; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W -; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; This really folds away to false -; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc -; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] -; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; This really folds away to true -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc -; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] -; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void 
@zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: -; GCN-NOT: v_cmp -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: -; GCN-NOT: v_cmp -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: -; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}} -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_zext_k_i8max: -; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = zext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, 255 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1: -; GCN: buffer_load_sbyte [[B:v[0-9]+]] -; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { - %b = load i8, i8 addrspace(1)* %b.ptr - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FIXME: This ends up doing a buffer_load_ubyte, and and compare to -; 255. Seems to be because of ordering problems when not allowing load widths to be reduced. 
-; Should do a buffer_load_sbyte and compare with -1 - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: -; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_zext_k_neg1: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = zext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll deleted file mode 100644 index f33a82df5ff..00000000000 --- a/test/CodeGen/R600/setcc.ll +++ /dev/null @@ -1,377 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}setcc_v2i32: -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y - -define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { - %result = icmp eq <2 x i32> %a, %b - %sext = sext <2 x i1> %result to <2 x i32> - store <2 x i32> %sext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}setcc_v4i32: -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = icmp eq <4 x i32> %a, %b - %sext = sext <4 x i1> %result to <4 x i32> - store <4 x i32> %sext, <4 x i32> addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; Float comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}f32_oeq: -; R600: SETE_DX10 -; SI: v_cmp_eq_f32 -define void 
@f32_oeq(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp oeq float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ogt: -; R600: SETGT_DX10 -; SI: v_cmp_gt_f32 -define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ogt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_oge: -; R600: SETGE_DX10 -; SI: v_cmp_ge_f32 -define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp oge float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_olt: -; R600: SETGT_DX10 -; SI: v_cmp_lt_f32 -define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp olt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ole: -; R600: SETGE_DX10 -; SI: v_cmp_le_f32 -define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ole float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_one: -; R600-DAG: SETE_DX10 -; R600-DAG: SETE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_INT - -; SI: v_cmp_lg_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp one float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ord: -; R600-DAG: SETE_DX10 -; R600-DAG: SETE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_INT -; SI: v_cmp_o_f32 -define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ord float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ueq: -; R600-DAG: SETNE_DX10 -; R600-DAG: SETNE_DX10 -; R600-DAG: OR_INT -; R600-DAG: SETE_DX10 -; R600-DAG: OR_INT -; R600-DAG: SETNE_INT - -; SI: v_cmp_nlg_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ueq float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ugt: -; R600: SETGE -; R600: SETE_DX10 -; SI: v_cmp_nle_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ugt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_uge: -; R600: SETGT -; R600: SETE_DX10 - -; SI: v_cmp_nlt_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp uge float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ult: -; R600: SETGE -; R600: SETE_DX10 - -; SI: v_cmp_nge_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ult float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ule: -; R600: SETGT -; R600: SETE_DX10 - -; SI: v_cmp_ngt_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ule(i32 
addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ule float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_une: -; R600: SETNE_DX10 -; SI: v_cmp_neq_f32 -define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp une float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_uno: -; R600: SETNE_DX10 -; R600: SETNE_DX10 -; R600: OR_INT -; R600: SETNE_INT -; SI: v_cmp_u_f32 -define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp uno float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; 32-bit integer comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}i32_eq: -; R600: SETE_INT -; SI: v_cmp_eq_i32 -define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ne: -; R600: SETNE_INT -; SI: v_cmp_ne_i32 -define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ne i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ugt: -; R600: SETGT_UINT -; SI: v_cmp_gt_u32 -define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ugt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_uge: -; R600: SETGE_UINT -; SI: v_cmp_ge_u32 -define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp uge i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ult: -; R600: SETGT_UINT -; SI: v_cmp_lt_u32 -define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ult i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ule: -; R600: SETGE_UINT -; SI: v_cmp_le_u32 -define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ule i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sgt: -; R600: SETGT_INT -; SI: v_cmp_gt_i32 -define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sgt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sge: -; R600: SETGE_INT -; SI: v_cmp_ge_i32 -define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sge i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_slt: -; R600: SETGT_INT -; SI: v_cmp_lt_i32 -define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp slt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sle: -; R600: SETGE_INT -; SI: v_cmp_le_i32 -define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sle i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FIXME: This does 4 compares -; FUNC-LABEL: {{^}}v3i32_eq: -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: 
v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI: s_endpgm -define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid - %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid - %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid - %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a - %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b - %cmp = icmp eq <3 x i32> %a, %b - %ext = sext <3 x i1> %cmp to <3 x i32> - store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}v3i8_eq: -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI: s_endpgm -define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid - %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid - %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid - %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a - %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b - %cmp = icmp eq <3 x i8> %a, %b - %ext = sext <3 x i1> %cmp to <3 x i8> - store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out - ret void -} diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll deleted file mode 100644 index 231be7aa3da..00000000000 --- a/test/CodeGen/R600/setcc64.ll +++ /dev/null @@ -1,259 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; XXX: Merge this into setcc, once R600 supports 64-bit operations - -;;;==========================================================================;;; -;; Double comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}f64_oeq: -; SI: v_cmp_eq_f64 -define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp oeq double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ogt: -; SI: v_cmp_gt_f64 -define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ogt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_oge: -; SI: v_cmp_ge_f64 -define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp oge double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_olt: -; SI: v_cmp_lt_f64 -define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp olt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ole: -; SI: v_cmp_le_f64 -define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ole double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out 
- ret void -} - -; FUNC-LABEL: {{^}}f64_one: -; SI: v_cmp_lg_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp one double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ord: -; SI: v_cmp_o_f64 -define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ord double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ueq: -; SI: v_cmp_nlg_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ueq double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ugt: - -; SI: v_cmp_nle_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ugt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_uge: -; SI: v_cmp_nlt_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp uge double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ult: -; SI: v_cmp_nge_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ult double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ule: -; SI: v_cmp_ngt_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ule double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_une: -; SI: v_cmp_neq_f64 -define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp une double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_uno: -; SI: v_cmp_u_f64 -define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp uno double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; 64-bit integer comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}i64_eq: -; SI: v_cmp_eq_i64 -define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp eq i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ne: -; SI: v_cmp_ne_i64 -define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ne i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ugt: -; SI: v_cmp_gt_u64 -define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ugt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_uge: -; SI: v_cmp_ge_u64 -define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp uge i64 %a, %b - %1 
= sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ult: -; SI: v_cmp_lt_u64 -define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ult i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ule: -; SI: v_cmp_le_u64 -define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ule i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sgt: -; SI: v_cmp_gt_i64 -define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sgt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sge: -; SI: v_cmp_ge_i64 -define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sge i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_slt: -; SI: v_cmp_lt_i64 -define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp slt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sle: -; SI: v_cmp_le_i64 -define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sle i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll deleted file mode 100644 index 9b5d6b5dbd6..00000000000 --- a/test/CodeGen/R600/seto.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] -; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { -main_body: - %c = fcmp oeq float %p, %p - %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll deleted file mode 100644 index 76346c4f624..00000000000 --- a/test/CodeGen/R600/setuo.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] -; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { -main_body: - %c = fcmp une float %p, %p - %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/sext-eliminate.ll b/test/CodeGen/R600/sext-eliminate.ll deleted file mode 100644 index 7dc6eb87f6b..00000000000 --- a/test/CodeGen/R600/sext-eliminate.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add: - -; EG: MEM_{{.*}} 
STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: SUB_INT {{[* ]*}}[[RES]] -; EG-NOT: BFE -define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { - %sext = sext i1 %a to i32 - %res = add i32 %b, %sext - store i32 %res, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub: - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT {{[* ]*}}[[RES]] -; EG-NOT: BFE -define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { - %sext = sext i1 %a to i32 - %res = sub i32 %b, %sext - store i32 %res, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll deleted file mode 100644 index 5aedda2ce1a..00000000000 --- a/test/CodeGen/R600/sext-in-reg.ll +++ /dev/null @@ -1,611 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 -; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]] -; SI: buffer_store_dword [[EXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { - %shl = shl i32 %in, 31 - %sext = ashr i32 %shl, 31 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %shl = shl i32 %c, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %shl = shl i32 %c, 16 - %ashr = ashr i32 %shl, 16 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_v1i32(<1 x i32> 
addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { - %c = add <1 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <1 x i32> %c, - %ashr = ashr <1 x i32> %shl, - store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 63 - %ashr = ashr i64 %shl, 63 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 56 - %ashr = ashr i64 %shl, 56 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 48 - %ashr = ashr i64 %shl, 48 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG-NOT: BFE_INT - -; EG: ASHR [[RES_HI]] - -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i32_to_i64(i64 
addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 32 - %ashr = ashr i64 %shl, 32 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments. -; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64: -; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288 -; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31 -; XSI: buffer_store_dword -; XEG: BFE_INT -; XEG: ASHR -; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind { -; %c = add <1 x i64> %a, %b -; %shl = shl <1 x i64> %c, -; %ashr = ashr <1 x i64> %shl, -; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8 -; ret void -; } - -; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 63 - %ashr = ashr i64 %shl, 63 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 56 - %ashr = ashr i64 %shl, 56 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 48 - %ashr = ashr i64 %shl, 48 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64: -; SI: 
buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}} -define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 32 - %ashr = ashr i64 %shl, 32 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount: -; SI-NOT: s_lshl -; SI-NOT: s_ashr -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG-NOT: BFE -; EG: ADD_INT -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b - %x = shl i32 %c, 6 - %y = ashr i32 %x, 7 - store i32 %y, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount: -; SI-NOT: s_lshl -; SI-NOT: s_ashr -; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 -; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 -; SI: s_endpgm - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG-NOT: BFE -; EG: ADD_INT -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b - %x = shl <2 x i32> %c, - %y = ashr <2 x i32> %x, - store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2 - ret void -} - - -; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32: -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32: -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: buffer_store_dwordx4 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %c = add <4 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: 
{{^}}sext_in_reg_v2i8_to_v2i32: -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32: -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx4 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %c = add <4 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32: -; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}testcase: -define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind { - %and_a_1 = and i8 %a, 1 - %cmp_eq = icmp eq i8 %and_a_1, 0 - %cmp_slt = icmp slt i8 %a, 0 - %sel0 = select i1 %cmp_slt, i8 0, i8 %a - %sel1 = select i1 %cmp_eq, i8 0, i8 %a - %xor = xor i8 %sel0, %sel1 - store i8 %xor, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}testcase_3: -define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind { - %and_a_1 = and i8 %a, 1 - %cmp_eq = icmp eq i8 %and_a_1, 0 - %cmp_slt = icmp slt i8 %a, 0 - %sel0 = select i1 %cmp_slt, i8 0, i8 %a - %sel1 = select i1 %cmp_eq, i8 0, i8 %a - %xor = xor i8 %sel0, %sel1 - store i8 %xor, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32: -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { - %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 - %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 - %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; 
FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32: -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { - %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 - %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 - %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type: -; SI: buffer_load_sbyte -; SI: v_max_i32 -; SI-NOT: bfe -; SI: buffer_store_short -define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = sext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = sext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_0_width: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_8: -; SI: v_bfe_i32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_16: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; SI: s_endpgm -define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; This really should be folded into 1 -; FUNC-LABEL: {{^}}bfe_16_bfe_8: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure there isn't a redundant BFE -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe: -; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 
addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: -define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: -; SI: buffer_load_sbyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI: .text -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: -; SI-NOT: shr -; SI-NOT: shl -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: -; SI: buffer_load_dword -; SI-NOT: shl -; SI-NOT: shr -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 -; SI: s_endpgm -define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: -; SI: buffer_load_dword -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 -; SI: s_endpgm -define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll deleted file mode 100644 index 38289ced632..00000000000 --- a/test/CodeGen/R600/sgpr-control-flow.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; -; -; Most SALU instructions ignore control flow, so we need to make sure -; they don't overwrite values from other blocks. 
- -; If the branch decision is made based on a value in an SGPR then all -; threads will execute the same code paths, so we don't need to worry -; about instructions in different blocks overwriting each other. -; SI-LABEL: {{^}}sgpr_if_else_salu_br: -; SI: s_add -; SI: s_add - -define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { -entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = add i32 %b, %c - br label %endif - -else: - %2 = add i32 %d, %e - br label %endif - -endif: - %3 = phi i32 [%1, %if], [%2, %else] - %4 = add i32 %3, %a - store i32 %4, i32 addrspace(1)* %out - ret void -} - -; The two S_ADD instructions should write to different registers, since -; different threads will take different control flow paths. - -; SI-LABEL: {{^}}sgpr_if_else_valu_br: -; SI: s_add_i32 [[SGPR:s[0-9]+]] -; SI-NOT: s_add_i32 [[SGPR]] - -define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %tid_f = uitofp i32 %tid to float - %tmp1 = fcmp ueq float %tid_f, 0.0 - br i1 %tmp1, label %if, label %else - -if: - %tmp2 = add i32 %b, %c - br label %endif - -else: - %tmp3 = add i32 %d, %e - br label %endif - -endif: - %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else] - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should write to different SGPR pairs instead of copying to -; VALU for i1 phi. - -; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br: -; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] -; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] - -; SI: BB2_1: -; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] -; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] - -; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] -; SI: buffer_store_dword [[RESULT]] -define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %tmp1 = icmp eq i32 %tid, 0 - br i1 %tmp1, label %if, label %else - -if: - %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid - %a.val = load i32, i32 addrspace(1)* %gep.if - %cmp.if = icmp eq i32 %a.val, 0 - br label %endif - -else: - %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid - %b.val = load i32, i32 addrspace(1)* %gep.else - %cmp.else = icmp slt i32 %b.val, 0 - br label %endif - -endif: - %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else] - %ext = sext i1 %tmp4 to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll deleted file mode 100644 index df67fcca22f..00000000000 --- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; Copy VGPR -> SGPR used twice as an instruction operand, which is then -; used in an REG_SEQUENCE that also needs to be handled. 
- -; SI-LABEL: {{^}}test_dup_operands: -; SI: v_add_i32_e32 -define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %lo = extractelement <2 x i32> %a, i32 0 - %hi = extractelement <2 x i32> %a, i32 1 - %add = add i32 %lo, %lo - %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0 - %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1 - store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll deleted file mode 100644 index b849c4038bc..00000000000 --- a/test/CodeGen/R600/sgpr-copy.ll +++ /dev/null @@ -1,379 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; This test checks that no VGPR to SGPR copies are created by the register -; allocator. -; CHECK-LABEL: {{^}}phi1: -; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 -; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] - -define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) - %25 = fptosi float %23 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %26, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %27 = fsub float -0.000000e+00, %22 - br label %ENDIF - -ENDIF: ; preds = %main_body, %ELSE - %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ] - %28 = fadd float %temp.0, %24 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00) - ret void -} - -; Make sure this program doesn't crash -; CHECK-LABEL: {{^}}phi2: -define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36) - %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40) - %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48) - %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52) - %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56) - %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64) - %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68) - %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72) - %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76) - %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80) - %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84) - %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88) - %36 = call float 
@llvm.SI.load.const(<16 x i8> %21, i32 92) - %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1 - %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 - %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1 - %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) - %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) - %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5) - %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5) - %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5) - %46 = bitcast float %41 to i32 - %47 = bitcast float %42 to i32 - %48 = insertelement <2 x i32> undef, i32 %46, i32 0 - %49 = insertelement <2 x i32> %48, i32 %47, i32 1 - %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2) - %51 = extractelement <4 x float> %50, i32 2 - %52 = call float @fabs(float %51) - %53 = fmul float %43, %43 - %54 = fmul float %44, %44 - %55 = fadd float %54, %53 - %56 = fmul float %45, %45 - %57 = fadd float %55, %56 - %58 = call float @llvm.AMDGPU.rsq.f32(float %57) - %59 = fmul float %43, %58 - %60 = fmul float %44, %58 - %61 = fmul float %45, %58 - %62 = fmul float %59, %23 - %63 = fmul float %60, %24 - %64 = fadd float %63, %62 - %65 = fmul float %61, %25 - %66 = fadd float %64, %65 - %67 = fsub float -0.000000e+00, %26 - %68 = fmul float %66, %52 - %69 = fadd float %68, %67 - %70 = fmul float %27, %69 - %71 = fmul float %28, %69 - %72 = call float @fabs(float %70) - %73 = fcmp olt float 0x3EE4F8B580000000, %72 - %74 = sext i1 %73 to i32 - %75 = bitcast i32 %74 to float - %76 = bitcast float %75 to i32 - %77 = icmp ne i32 %76, 0 - br i1 %77, label %IF, label %ENDIF - -IF: ; preds = %main_body - %78 = fsub float -0.000000e+00, %70 - %79 = call float @llvm.AMDIL.exp.(float %78) - %80 = fsub float -0.000000e+00, %79 - %81 = fadd float 1.000000e+00, %80 - %82 = fdiv float 1.000000e+00, %70 - %83 = fmul float %81, %82 - %84 = fmul float %32, %83 - br label %ENDIF - -ENDIF: ; preds = %main_body, %IF - %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ] - %85 = call float @fabs(float %71) - %86 = fcmp olt float 0x3EE4F8B580000000, %85 - %87 = sext i1 %86 to i32 - %88 = bitcast i32 %87 to float - %89 = bitcast float %88 to i32 - %90 = icmp ne i32 %89, 0 - br i1 %90, label %IF25, label %ENDIF24 - -IF25: ; preds = %ENDIF - %91 = fsub float -0.000000e+00, %71 - %92 = call float @llvm.AMDIL.exp.(float %91) - %93 = fsub float -0.000000e+00, %92 - %94 = fadd float 1.000000e+00, %93 - %95 = fdiv float 1.000000e+00, %71 - %96 = fmul float %94, %95 - %97 = fmul float %36, %96 - br label %ENDIF24 - -ENDIF24: ; preds = %ENDIF, %IF25 - %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ] - %98 = fmul float %29, %temp4.0 - %99 = fmul float %30, %temp4.0 - %100 = fmul float %31, %temp4.0 - %101 = fmul float %33, %temp8.0 - %102 = fadd float %101, %98 - %103 = fmul float %34, %temp8.0 - %104 = fadd float %103, %99 - %105 = fmul float %35, %temp8.0 - %106 = fadd float %105, %100 - %107 = call float @llvm.pow.f32(float %52, float %22) - %108 = fsub float -0.000000e+00, %102 - %109 = fmul float %108, %107 - %110 = fsub float -0.000000e+00, %104 - %111 = fmul float %110, %107 - %112 = fsub float -0.000000e+00, %106 - %113 = fmul float %112, %107 - %114 = call i32 @llvm.SI.packf16(float %109, float %111) - %115 = bitcast i32 %114 to float - %116 = call i32 @llvm.SI.packf16(float 
%113, float 1.000000e+00) - %117 = bitcast i32 %116 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117) - ret void -} - -; We just want ot make sure the program doesn't crash -; CHECK-LABEL: {{^}}loop: - -define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8) - %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12) - %26 = fptosi float %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ] - %29 = bitcast float %temp8.0 to i32 - %30 = icmp sge i32 %29, %28 - %31 = sext i1 %30 to i32 - %32 = bitcast i32 %31 to float - %33 = bitcast float %32 to i32 - %34 = icmp ne i32 %33, 0 - br i1 %34, label %IF, label %ENDIF - -IF: ; preds = %LOOP - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00) - ret void - -ENDIF: ; preds = %LOOP - %35 = bitcast float %temp8.0 to i32 - %36 = add i32 %35, 1 - %37 = bitcast i32 %36 to float - br label %LOOP -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readonly } -attributes #3 = { readnone } -attributes #4 = { nounwind readonly } - -!0 = !{!"const", null} -!1 = !{!0, !0, i64 0, i32 1} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #3 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #3 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #4 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -; This checks for a bug in the FixSGPRCopies pass where VReg96 -; registers were being identified as an SGPR regclass which was causing -; an assertion failure. 
- -; CHECK-LABEL: {{^}}sample_v3: -; CHECK: image_sample -; CHECK: image_sample -; CHECK: exp -; CHECK: s_endpgm -define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - -entry: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16) - %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2 - %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2 - %28 = fcmp oeq float %23, 0.0 - br i1 %28, label %if, label %else - -if: - %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) - %val.if.0 = extractelement <4 x float> %val.if, i32 0 - %val.if.1 = extractelement <4 x float> %val.if, i32 1 - %val.if.2 = extractelement <4 x float> %val.if, i32 2 - br label %endif - -else: - %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) - %val.else.0 = extractelement <4 x float> %val.else, i32 0 - %val.else.1 = extractelement <4 x float> %val.else, i32 1 - %val.else.2 = extractelement <4 x float> %val.else, i32 2 - br label %endif - -endif: - %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else] - %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else] - %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else] - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0) - ret void -} - -!2 = !{!"const", null, i32 1} - -; CHECK-LABEL: {{^}}copy1: -; CHECK: buffer_load_dword -; CHECK: v_add -; CHECK: s_endpgm -define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { -entry: - %0 = load float, float addrspace(1)* %in0 - %1 = fcmp oeq float %0, 0.0 - br i1 %1, label %if0, label %endif - -if0: - %2 = bitcast float %0 to i32 - %3 = fcmp olt float %0, 0.0 - br i1 %3, label %if1, label %endif - -if1: - %4 = add i32 %2, 1 - br label %endif - -endif: - %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ] - %6 = bitcast i32 %5 to float - store float %6, float addrspace(1)* %out - ret void -} - -; This test is just checking that we don't crash / assertion fail. 
-; CHECK-LABEL: {{^}}copy2: -; CHECK: s_endpgm - -define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -entry: - br label %LOOP68 - -LOOP68: - %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ] - %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ] - %g = icmp eq i32 0, %t - %l = bitcast float %temp4.7 to i32 - br i1 %g, label %IF70, label %ENDIF69 - -IF70: - %q = icmp ne i32 %l, 13 - %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - ret void - -ENDIF69: - %u = add i32 %l, %t - %v = bitcast i32 %u to float - %x = add i32 %t, -1 - br label %LOOP68 -} - -attributes #0 = { "ShaderType"="0" } - -; This test checks that image_sample resource descriptors aren't loaded into -; vgprs. The verifier will fail if this happens. -; CHECK-LABEL:{{^}}sample_rsrc: -; CHECK: image_sample -; CHECK: image_sample -; CHECK: s_endpgm -define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { -bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) - %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 - %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0 - %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 - %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0 - %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp31 = bitcast float %tmp23 to i32 - %tmp36 = icmp ne i32 %tmp31, 0 - br i1 %tmp36, label %bb38, label %bb80 - -bb38: ; preds = %bb - %tmp52 = bitcast float %tmp29 to i32 - %tmp53 = bitcast float %tmp30 to i32 - %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 - %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 - %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) - br label %bb71 - -bb80: ; preds = %bb - %tmp81 = bitcast float %tmp29 to i32 - %tmp82 = bitcast float %tmp30 to i32 - %tmp82.2 = add i32 %tmp82, 1 - %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 - %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 - %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) - br label %bb71 - -bb71: ; preds = %bb80, %bb38 - %tmp72 = phi <4 x float> [ %tmp58, 
%bb38 ], [ %tmp87, %bb80 ] - %tmp88 = extractelement <4 x float> %tmp72, i32 0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) - ret void -} - -attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/shared-op-cycle.ll b/test/CodeGen/R600/shared-op-cycle.ll deleted file mode 100644 index f52a9baf4d1..00000000000 --- a/test/CodeGen/R600/shared-op-cycle.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main: -; CHECK: MULADD_IEEE * -; CHECK-NOT: MULADD_IEEE * - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { - %w0 = extractelement <4 x float> %reg0, i32 3 - %w1 = extractelement <4 x float> %reg1, i32 3 - %w2 = extractelement <4 x float> %reg2, i32 3 - %sq0 = fmul float %w0, %w0 - %r0 = fadd float %sq0, 2.0 - %sq1 = fmul float %w1, %w1 - %r1 = fadd float %sq1, 2.0 - %sq2 = fmul float %w2, %w2 - %r2 = fadd float %sq2, 2.0 - %v0 = insertelement <4 x float> undef, float %r0, i32 0 - %v1 = insertelement <4 x float> %v0, float %r1, i32 1 - %v2 = insertelement <4 x float> %v1, float %r2, i32 2 - %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2) - %vecres = insertelement <4 x float> undef, float %res, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } \ No newline at end of file diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll deleted file mode 100644 index 53b63dc4b8a..00000000000 --- a/test/CodeGen/R600/shl.ll +++ /dev/null @@ -1,180 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s - -;EG: {{^}}shl_v2i32: -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}shl_v2i32: -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI: {{^}}shl_v2i32: -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = shl <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v4i32: -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}shl_v4i32: -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI: {{^}}shl_v4i32: -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = shl <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_i64: -;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} -;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 - -;SI: {{^}}shl_i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1) * %in - %b = load i64, i64 addrspace(1) * %b_ptr - %result = shl i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v2i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI: {{^}}shl_v2i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_v2i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1) * %in - %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr - %result = shl <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v4i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHC]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHD]] -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHC]] -;EG-DAG: LSHL {{.*}}, [[SHD]] -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHC]] -;EG-DAG: LSHL {{.*}}, [[SHD]] -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHD]], literal -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI: {{^}}shl_v4i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_v4i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1) * %in - %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr - %result = shl <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/shl_add_constant.ll b/test/CodeGen/R600/shl_add_constant.ll deleted file mode 100644 index b1485bfaaeb..00000000000 --- a/test/CodeGen/R600/shl_add_constant.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; Test with inline immediate - -; FUNC-LABEL: {{^}}shl_2_add_9_i32: -; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %add = add i32 %val, 9 - %result = shl i32 %add, 2 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: -; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} -; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} -; SI-DAG: buffer_store_dword [[ADDREG]] -; SI-DAG: buffer_store_dword [[SHLREG]] -; SI: s_endpgm -define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %add = add i32 %val, 9 - %result = shl i32 %add, 2 - store i32 %result, i32 addrspace(1)* %out0, align 4 - store i32 %add, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test with add literal constant - -; FUNC-LABEL: {{^}}shl_2_add_999_i32: -; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %shl = add i32 %val, 999 - %result = shl i32 %shl, 2 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}test_add_shl_add_constant: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] -; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %add.0 = add i32 %x, 123 - %shl = shl i32 %add.0, 3 - %add.1 = add i32 %shl, %y - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] -; SI: buffer_store_dword [[VRESULT]] - -define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %add.0 = add i32 %x, 123 - %shl = shl i32 %add.0, 3 - %add.1 = add i32 %y, %shl - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll deleted file mode 100644 index 6671e909cd1..00000000000 --- a/test/CodeGen/R600/shl_add_ptr.ll +++ /dev/null @@ -1,284 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - -; Test that doing a shift of a pointer with a constant add will be -; folded into the constant offset addressing mode even if the add has -; multiple uses. This is relevant to accessing 2 separate, adjacent -; LDS globals. - - -declare i32 @llvm.r600.read.tidig.x() #1 - -@lds0 = addrspace(3) global [512 x float] undef, align 4 -@lds1 = addrspace(3) global [512 x float] undef, align 4 - - -; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8 - -; SI-LABEL: {{^}}load_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 -; SI: s_endpgm -define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - store float %val0, float addrspace(1)* %out - ret void -} - -; Make sure once the first use is folded into the addressing mode, the -; remaining add use goes through the normal shl + add constant fold. 
- -; SI-LABEL: {{^}}load_shl_base_lds_1: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 -; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} -; SI-DAG: buffer_store_dword [[RESULT]] -; SI-DAG: buffer_store_dword [[ADDUSE]] -; SI: s_endpgm -define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %shl_add_use = shl i32 %idx.0, 2 - store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4 - store float %val0, float addrspace(1)* %out - ret void -} - -@maxlds = addrspace(3) global [65536 x i8] undef, align 4 - -; SI-LABEL: {{^}}load_shl_base_lds_max_offset -; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 -; SI: s_endpgm -define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 65535 - %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 - %val0 = load i8, i8 addrspace(3)* %arrayidx0 - store i32 %idx.0, i32 addrspace(1)* %add_use - store i8 %val0, i8 addrspace(1)* %out - ret void -} - -; The two globals are placed adjacent in memory, so the same base -; pointer can be used with an offset into the second one. - -; SI-LABEL: {{^}}load_shl_base_lds_2: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 -; SI: s_endpgm -define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - store float %sum, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}store_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - store float 1.0, float addrspace(3)* %arrayidx0, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - - -; -------------------------------------------------------------------------------- -; Atomics. 
- -@lds2 = addrspace(3) global [512 x i32] undef, align 4 - -; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { -; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 -; %idx.0 = add nsw i32 %tid.x, 2 -; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 -; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 -; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 -; ret void -; } - - -; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_add_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_and_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 
addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_or_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { -; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 -; %idx.0 = add nsw i32 %tid.x, 2 -; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 -; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst -; store i32 %val, i32 addrspace(1)* %out, align 4 -; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 -; ret void -; } - -; SI-LABEL: {{^}}atomic_min_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_max_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: 
{{^}}atomic_umin_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll deleted file mode 100644 index 69d719385ac..00000000000 --- a/test/CodeGen/R600/si-annotate-cf-assertion.ll +++ /dev/null @@ -1,25 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s - - -define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { -; CHECK-LABEL: {{^}}test: - -entry: - switch i32 %x, label %sw.default [ - i32 0, label %sw.bb - i32 60, label %sw.bb - ] - -sw.bb: - unreachable - -sw.default: - unreachable - -sw.epilog: - ret void -} - diff --git a/test/CodeGen/R600/si-annotate-cf.ll b/test/CodeGen/R600/si-annotate-cf.ll deleted file mode 100644 index bbcb861f37d..00000000000 --- a/test/CodeGen/R600/si-annotate-cf.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}break_inserted_outside_of_loop: - -; SI: [[LOOP_LABEL:[A-Z0-9]+]]: -; Lowered break instructin: -; SI: s_or_b64 -; Lowered Loop instruction: -; SI: s_andn2_b64 -; s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) { -main_body: - %0 = and i32 %a, %b - %1 = trunc i32 %0 to i1 - br label %ENDIF - -ENDLOOP: - store i32 0, i32 addrspace(1)* %out - ret void - -ENDIF: - br i1 %1, label %ENDLOOP, label %ENDIF -} - - -; FUNC-LABEL: {{^}}phi_cond_outside_loop: -; FIXME: This could be folded into the s_or_b64 instruction -; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0 -; SI: [[LOOP_LABEL:[A-Z0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} - -; SI_IF_BREAK instruction: -; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]] - -; SI_LOOP instruction: -; SI: s_andn2_b64 exec, exec, [[BREAK]] -; SI: s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm - 
-define void @phi_cond_outside_loop(i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a , 0 - br i1 %0, label %if, label %else - -if: - br label %endif - -else: - %1 = icmp eq i32 %b, 0 - br label %endif - -endif: - %2 = phi i1 [0, %if], [%1, %else] - br label %loop - -loop: - br i1 %2, label %exit, label %loop - -exit: - ret void -} diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll deleted file mode 100644 index 944499a1146..00000000000 --- a/test/CodeGen/R600/si-lod-bias.ll +++ /dev/null @@ -1,52 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; This shader has the potential to generated illegal VGPR to SGPR copies if -; the wrong register class is used for the REG_SEQUENCE instructions. - -; CHECK: {{^}}main: -; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}} - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1 - %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 - %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1 - %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) - %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) - %29 = bitcast float %22 to i32 - %30 = bitcast float %27 to i32 - %31 = bitcast float %28 to i32 - %32 = insertelement <4 x i32> undef, i32 %29, i32 0 - %33 = insertelement <4 x i32> %32, i32 %30, i32 1 - %34 = insertelement <4 x i32> %33, i32 %31, i32 2 - %35 = insertelement <4 x i32> %34, i32 undef, i32 3 - %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2) - %37 = extractelement <4 x float> %36, i32 0 - %38 = extractelement <4 x float> %36, i32 1 - %39 = extractelement <4 x float> %36, i32 2 - %40 = extractelement <4 x float> %36, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40) - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll deleted file mode 100644 index 84652701f77..00000000000 --- a/test/CodeGen/R600/si-sgpr-spill.ll +++ /dev/null @@ -1,1568 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; These tests check that the compiler won't crash when it needs to spill -; SGPRs. 
- -; CHECK-LABEL: {{^}}main: -; CHECK: s_wqm -; Writing to M0 from an SMRD instruction will hang the GPU. -; CHECK-NOT: s_buffer_load_dword m0 -; CHECK: s_endpgm -@ddxy_lds = external addrspace(3) global [64 x i32] - -define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96) - %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100) - %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104) - %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112) - %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116) - %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120) - %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) - %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) - %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140) - %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) - %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) - %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) - %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) - %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) - %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) - %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) - %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) - %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) - %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) - %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) - %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224) - %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) - %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) - %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) - %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) - %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) - %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) - %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) - %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) - %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) - %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296) - %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304) - %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308) - %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312) - %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368) - %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372) - %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376) - %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384) - %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0 - %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0 - %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 - %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0 - %67 = getelementptr [32 x 
<16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 - %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0 - %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 - %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0 - %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 - %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0 - %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 - %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0 - %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 - %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0 - %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 - %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0 - %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 - %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0 - %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 - %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0 - %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 - %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0 - %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 - %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0 - %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 - %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0 - %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 - %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0 - %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 - %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0 - %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) - %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) - %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) - %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) - %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) - %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) - %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) - %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) - %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) - %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) - %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) - %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) - %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) - %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) - %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) - %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) - %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) - %110 = call i32 @llvm.SI.tid() - %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110 - %112 = bitcast float %93 to i32 - store i32 %112, i32 addrspace(3)* %111 - %113 = bitcast float %94 to i32 - store i32 %113, i32 addrspace(3)* %111 - %114 = call i32 @llvm.SI.tid() - %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* 
@ddxy_lds, i32 0, i32 %114 - %116 = and i32 %114, -4 - %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116 - %118 = add i32 %116, 1 - %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118 - %120 = bitcast float %93 to i32 - store i32 %120, i32 addrspace(3)* %115 - %121 = load i32, i32 addrspace(3)* %117 - %122 = bitcast i32 %121 to float - %123 = load i32, i32 addrspace(3)* %119 - %124 = bitcast i32 %123 to float - %125 = fsub float %124, %122 - %126 = bitcast float %94 to i32 - store i32 %126, i32 addrspace(3)* %115 - %127 = load i32, i32 addrspace(3)* %117 - %128 = bitcast i32 %127 to float - %129 = load i32, i32 addrspace(3)* %119 - %130 = bitcast i32 %129 to float - %131 = fsub float %130, %128 - %132 = insertelement <4 x float> undef, float %125, i32 0 - %133 = insertelement <4 x float> %132, float %131, i32 1 - %134 = insertelement <4 x float> %133, float %131, i32 2 - %135 = insertelement <4 x float> %134, float %131, i32 3 - %136 = extractelement <4 x float> %135, i32 0 - %137 = extractelement <4 x float> %135, i32 1 - %138 = fmul float %60, %93 - %139 = fmul float %60, %94 - %140 = fmul float %60, %94 - %141 = fmul float %60, %94 - %142 = call i32 @llvm.SI.tid() - %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142 - %144 = bitcast float %138 to i32 - store i32 %144, i32 addrspace(3)* %143 - %145 = bitcast float %139 to i32 - store i32 %145, i32 addrspace(3)* %143 - %146 = bitcast float %140 to i32 - store i32 %146, i32 addrspace(3)* %143 - %147 = bitcast float %141 to i32 - store i32 %147, i32 addrspace(3)* %143 - %148 = call i32 @llvm.SI.tid() - %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148 - %150 = and i32 %148, -4 - %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150 - %152 = add i32 %150, 2 - %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152 - %154 = bitcast float %138 to i32 - store i32 %154, i32 addrspace(3)* %149 - %155 = load i32, i32 addrspace(3)* %151 - %156 = bitcast i32 %155 to float - %157 = load i32, i32 addrspace(3)* %153 - %158 = bitcast i32 %157 to float - %159 = fsub float %158, %156 - %160 = bitcast float %139 to i32 - store i32 %160, i32 addrspace(3)* %149 - %161 = load i32, i32 addrspace(3)* %151 - %162 = bitcast i32 %161 to float - %163 = load i32, i32 addrspace(3)* %153 - %164 = bitcast i32 %163 to float - %165 = fsub float %164, %162 - %166 = bitcast float %140 to i32 - store i32 %166, i32 addrspace(3)* %149 - %167 = load i32, i32 addrspace(3)* %151 - %168 = bitcast i32 %167 to float - %169 = load i32, i32 addrspace(3)* %153 - %170 = bitcast i32 %169 to float - %171 = fsub float %170, %168 - %172 = bitcast float %141 to i32 - store i32 %172, i32 addrspace(3)* %149 - %173 = load i32, i32 addrspace(3)* %151 - %174 = bitcast i32 %173 to float - %175 = load i32, i32 addrspace(3)* %153 - %176 = bitcast i32 %175 to float - %177 = fsub float %176, %174 - %178 = insertelement <4 x float> undef, float %159, i32 0 - %179 = insertelement <4 x float> %178, float %165, i32 1 - %180 = insertelement <4 x float> %179, float %171, i32 2 - %181 = insertelement <4 x float> %180, float %177, i32 3 - %182 = extractelement <4 x float> %181, i32 0 - %183 = extractelement <4 x float> %181, i32 1 - %184 = fdiv float 1.000000e+00, %97 - %185 = fmul float %33, %184 - %186 = fcmp uge float 1.000000e+00, %185 - %187 = select i1 %186, float %185, float 1.000000e+00 - %188 = fmul float 
%187, %30 - %189 = call float @ceil(float %188) - %190 = fcmp uge float 3.000000e+00, %189 - %191 = select i1 %190, float 3.000000e+00, float %189 - %192 = fdiv float 1.000000e+00, %191 - %193 = fdiv float 1.000000e+00, %30 - %194 = fmul float %191, %193 - %195 = fmul float %31, %194 - %196 = fmul float %95, %95 - %197 = fmul float %96, %96 - %198 = fadd float %197, %196 - %199 = fmul float %97, %97 - %200 = fadd float %198, %199 - %201 = call float @llvm.AMDGPU.rsq.f32(float %200) - %202 = fmul float %95, %201 - %203 = fmul float %96, %201 - %204 = fmul float %202, %29 - %205 = fmul float %203, %29 - %206 = fmul float %204, -1.000000e+00 - %207 = fmul float %205, 1.000000e+00 - %208 = fmul float %206, %32 - %209 = fmul float %207, %32 - %210 = fsub float -0.000000e+00, %208 - %211 = fadd float %93, %210 - %212 = fsub float -0.000000e+00, %209 - %213 = fadd float %94, %212 - %214 = fmul float %206, %192 - %215 = fmul float %207, %192 - %216 = fmul float -1.000000e+00, %192 - %217 = bitcast float %136 to i32 - %218 = bitcast float %182 to i32 - %219 = bitcast float %137 to i32 - %220 = bitcast float %183 to i32 - %221 = insertelement <8 x i32> undef, i32 %217, i32 0 - %222 = insertelement <8 x i32> %221, i32 %218, i32 1 - %223 = insertelement <8 x i32> %222, i32 %219, i32 2 - %224 = insertelement <8 x i32> %223, i32 %220, i32 3 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ] - %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ] - %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ] - %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ] - %225 = fcmp oge float %temp24.0, %191 - %226 = sext i1 %225 to i32 - %227 = bitcast i32 %226 to float - %228 = bitcast float %227 to i32 - %229 = icmp ne i32 %228, 0 - br i1 %229, label %IF, label %ENDIF - -IF: ; preds = %LOOP - %230 = bitcast float %136 to i32 - %231 = bitcast float %182 to i32 - %232 = bitcast float %137 to i32 - %233 = bitcast float %183 to i32 - %234 = insertelement <8 x i32> undef, i32 %230, i32 0 - %235 = insertelement <8 x i32> %234, i32 %231, i32 1 - %236 = insertelement <8 x i32> %235, i32 %232, i32 2 - %237 = insertelement <8 x i32> %236, i32 %233, i32 3 - br label %LOOP65 - -ENDIF: ; preds = %LOOP - %238 = bitcast float %temp28.0 to i32 - %239 = bitcast float %temp29.0 to i32 - %240 = insertelement <8 x i32> %224, i32 %238, i32 4 - %241 = insertelement <8 x i32> %240, i32 %239, i32 5 - %242 = insertelement <8 x i32> %241, i32 undef, i32 6 - %243 = insertelement <8 x i32> %242, i32 undef, i32 7 - %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2) - %245 = extractelement <4 x float> %244, i32 3 - %246 = fcmp oge float %temp30.0, %245 - %247 = sext i1 %246 to i32 - %248 = bitcast i32 %247 to float - %249 = bitcast float %248 to i32 - %250 = and i32 %249, 1065353216 - %251 = bitcast i32 %250 to float - %252 = fmul float %214, %251 - %253 = fadd float %252, %temp28.0 - %254 = fmul float %215, %251 - %255 = fadd float %254, %temp29.0 - %256 = fmul float %216, %251 - %257 = fadd float %256, %temp30.0 - %258 = fadd float %temp24.0, 1.000000e+00 - br label %LOOP - -LOOP65: ; preds = %ENDIF66, %IF - %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ] - %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ] - %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ] - %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ] - %temp32.0 = phi float [ 
1.000000e+00, %IF ], [ %611, %ENDIF66 ] - %259 = fcmp oge float %temp24.1, %195 - %260 = sext i1 %259 to i32 - %261 = bitcast i32 %260 to float - %262 = bitcast float %261 to i32 - %263 = icmp ne i32 %262, 0 - br i1 %263, label %IF67, label %ENDIF66 - -IF67: ; preds = %LOOP65 - %264 = bitcast float %136 to i32 - %265 = bitcast float %182 to i32 - %266 = bitcast float %137 to i32 - %267 = bitcast float %183 to i32 - %268 = bitcast float %temp28.1 to i32 - %269 = bitcast float %temp29.1 to i32 - %270 = insertelement <8 x i32> undef, i32 %264, i32 0 - %271 = insertelement <8 x i32> %270, i32 %265, i32 1 - %272 = insertelement <8 x i32> %271, i32 %266, i32 2 - %273 = insertelement <8 x i32> %272, i32 %267, i32 3 - %274 = insertelement <8 x i32> %273, i32 %268, i32 4 - %275 = insertelement <8 x i32> %274, i32 %269, i32 5 - %276 = insertelement <8 x i32> %275, i32 undef, i32 6 - %277 = insertelement <8 x i32> %276, i32 undef, i32 7 - %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2) - %279 = extractelement <4 x float> %278, i32 0 - %280 = extractelement <4 x float> %278, i32 1 - %281 = extractelement <4 x float> %278, i32 2 - %282 = extractelement <4 x float> %278, i32 3 - %283 = fmul float %282, %47 - %284 = bitcast float %136 to i32 - %285 = bitcast float %182 to i32 - %286 = bitcast float %137 to i32 - %287 = bitcast float %183 to i32 - %288 = bitcast float %temp28.1 to i32 - %289 = bitcast float %temp29.1 to i32 - %290 = insertelement <8 x i32> undef, i32 %284, i32 0 - %291 = insertelement <8 x i32> %290, i32 %285, i32 1 - %292 = insertelement <8 x i32> %291, i32 %286, i32 2 - %293 = insertelement <8 x i32> %292, i32 %287, i32 3 - %294 = insertelement <8 x i32> %293, i32 %288, i32 4 - %295 = insertelement <8 x i32> %294, i32 %289, i32 5 - %296 = insertelement <8 x i32> %295, i32 undef, i32 6 - %297 = insertelement <8 x i32> %296, i32 undef, i32 7 - %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2) - %299 = extractelement <4 x float> %298, i32 0 - %300 = extractelement <4 x float> %298, i32 1 - %301 = extractelement <4 x float> %298, i32 2 - %302 = bitcast float %136 to i32 - %303 = bitcast float %182 to i32 - %304 = bitcast float %137 to i32 - %305 = bitcast float %183 to i32 - %306 = bitcast float %temp28.1 to i32 - %307 = bitcast float %temp29.1 to i32 - %308 = insertelement <8 x i32> undef, i32 %302, i32 0 - %309 = insertelement <8 x i32> %308, i32 %303, i32 1 - %310 = insertelement <8 x i32> %309, i32 %304, i32 2 - %311 = insertelement <8 x i32> %310, i32 %305, i32 3 - %312 = insertelement <8 x i32> %311, i32 %306, i32 4 - %313 = insertelement <8 x i32> %312, i32 %307, i32 5 - %314 = insertelement <8 x i32> %313, i32 undef, i32 6 - %315 = insertelement <8 x i32> %314, i32 undef, i32 7 - %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2) - %317 = extractelement <4 x float> %316, i32 0 - %318 = extractelement <4 x float> %316, i32 1 - %319 = extractelement <4 x float> %316, i32 2 - %320 = fmul float %317, %23 - %321 = fmul float %318, %24 - %322 = fmul float %319, %25 - %323 = fmul float %299, %26 - %324 = fadd float %323, %320 - %325 = fmul float %300, %27 - %326 = fadd float %325, %321 - %327 = fmul float %301, %28 - %328 = fadd float %327, %322 - %329 = fadd float %279, %324 - %330 = fadd float %280, %326 - %331 = fadd float %281, %328 - %332 = bitcast float %136 to i32 - %333 = bitcast float %182 to i32 - %334 = bitcast float %137 to i32 - 
%335 = bitcast float %183 to i32 - %336 = bitcast float %temp28.1 to i32 - %337 = bitcast float %temp29.1 to i32 - %338 = insertelement <8 x i32> undef, i32 %332, i32 0 - %339 = insertelement <8 x i32> %338, i32 %333, i32 1 - %340 = insertelement <8 x i32> %339, i32 %334, i32 2 - %341 = insertelement <8 x i32> %340, i32 %335, i32 3 - %342 = insertelement <8 x i32> %341, i32 %336, i32 4 - %343 = insertelement <8 x i32> %342, i32 %337, i32 5 - %344 = insertelement <8 x i32> %343, i32 undef, i32 6 - %345 = insertelement <8 x i32> %344, i32 undef, i32 7 - %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2) - %347 = extractelement <4 x float> %346, i32 0 - %348 = extractelement <4 x float> %346, i32 1 - %349 = extractelement <4 x float> %346, i32 2 - %350 = fadd float %347, -5.000000e-01 - %351 = fadd float %348, -5.000000e-01 - %352 = fadd float %349, -5.000000e-01 - %353 = fmul float %350, %350 - %354 = fmul float %351, %351 - %355 = fadd float %354, %353 - %356 = fmul float %352, %352 - %357 = fadd float %355, %356 - %358 = call float @llvm.AMDGPU.rsq.f32(float %357) - %359 = fmul float %350, %358 - %360 = fmul float %351, %358 - %361 = fmul float %352, %358 - %362 = bitcast float %136 to i32 - %363 = bitcast float %182 to i32 - %364 = bitcast float %137 to i32 - %365 = bitcast float %183 to i32 - %366 = bitcast float %temp28.1 to i32 - %367 = bitcast float %temp29.1 to i32 - %368 = insertelement <8 x i32> undef, i32 %362, i32 0 - %369 = insertelement <8 x i32> %368, i32 %363, i32 1 - %370 = insertelement <8 x i32> %369, i32 %364, i32 2 - %371 = insertelement <8 x i32> %370, i32 %365, i32 3 - %372 = insertelement <8 x i32> %371, i32 %366, i32 4 - %373 = insertelement <8 x i32> %372, i32 %367, i32 5 - %374 = insertelement <8 x i32> %373, i32 undef, i32 6 - %375 = insertelement <8 x i32> %374, i32 undef, i32 7 - %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2) - %377 = extractelement <4 x float> %376, i32 0 - %378 = extractelement <4 x float> %376, i32 1 - %379 = extractelement <4 x float> %376, i32 2 - %380 = extractelement <4 x float> %376, i32 3 - %381 = fsub float -0.000000e+00, %95 - %382 = fsub float -0.000000e+00, %96 - %383 = fsub float -0.000000e+00, %97 - %384 = fmul float %359, %381 - %385 = fmul float %360, %382 - %386 = fadd float %385, %384 - %387 = fmul float %361, %383 - %388 = fadd float %386, %387 - %389 = fmul float %388, %359 - %390 = fmul float %388, %360 - %391 = fmul float %388, %361 - %392 = fmul float 2.000000e+00, %389 - %393 = fmul float 2.000000e+00, %390 - %394 = fmul float 2.000000e+00, %391 - %395 = fsub float -0.000000e+00, %392 - %396 = fadd float %381, %395 - %397 = fsub float -0.000000e+00, %393 - %398 = fadd float %382, %397 - %399 = fsub float -0.000000e+00, %394 - %400 = fadd float %383, %399 - %401 = fmul float %396, %98 - %402 = fmul float %396, %99 - %403 = fmul float %396, %100 - %404 = fmul float %398, %101 - %405 = fadd float %404, %401 - %406 = fmul float %398, %102 - %407 = fadd float %406, %402 - %408 = fmul float %398, %103 - %409 = fadd float %408, %403 - %410 = fmul float %400, %104 - %411 = fadd float %410, %405 - %412 = fmul float %400, %105 - %413 = fadd float %412, %407 - %414 = fmul float %400, %106 - %415 = fadd float %414, %409 - %416 = bitcast float %136 to i32 - %417 = bitcast float %182 to i32 - %418 = bitcast float %137 to i32 - %419 = bitcast float %183 to i32 - %420 = bitcast float %temp28.1 to i32 - %421 = bitcast float %temp29.1 to 
i32 - %422 = insertelement <8 x i32> undef, i32 %416, i32 0 - %423 = insertelement <8 x i32> %422, i32 %417, i32 1 - %424 = insertelement <8 x i32> %423, i32 %418, i32 2 - %425 = insertelement <8 x i32> %424, i32 %419, i32 3 - %426 = insertelement <8 x i32> %425, i32 %420, i32 4 - %427 = insertelement <8 x i32> %426, i32 %421, i32 5 - %428 = insertelement <8 x i32> %427, i32 undef, i32 6 - %429 = insertelement <8 x i32> %428, i32 undef, i32 7 - %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2) - %431 = extractelement <4 x float> %430, i32 0 - %432 = extractelement <4 x float> %430, i32 1 - %433 = extractelement <4 x float> %430, i32 2 - %434 = fmul float %48, %411 - %435 = fmul float %49, %411 - %436 = fmul float %50, %411 - %437 = fmul float %51, %413 - %438 = fadd float %437, %434 - %439 = fmul float %52, %413 - %440 = fadd float %439, %435 - %441 = fmul float %53, %413 - %442 = fadd float %441, %436 - %443 = fmul float %54, %415 - %444 = fadd float %443, %438 - %445 = fmul float %55, %415 - %446 = fadd float %445, %440 - %447 = fmul float %56, %415 - %448 = fadd float %447, %442 - %449 = insertelement <4 x float> undef, float %444, i32 0 - %450 = insertelement <4 x float> %449, float %446, i32 1 - %451 = insertelement <4 x float> %450, float %448, i32 2 - %452 = insertelement <4 x float> %451, float %195, i32 3 - %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452) - %454 = extractelement <4 x float> %453, i32 0 - %455 = extractelement <4 x float> %453, i32 1 - %456 = extractelement <4 x float> %453, i32 2 - %457 = extractelement <4 x float> %453, i32 3 - %458 = call float @fabs(float %456) - %459 = fdiv float 1.000000e+00, %458 - %460 = fmul float %454, %459 - %461 = fadd float %460, 1.500000e+00 - %462 = fmul float %455, %459 - %463 = fadd float %462, 1.500000e+00 - %464 = bitcast float %463 to i32 - %465 = bitcast float %461 to i32 - %466 = bitcast float %457 to i32 - %467 = insertelement <4 x i32> undef, i32 %464, i32 0 - %468 = insertelement <4 x i32> %467, i32 %465, i32 1 - %469 = insertelement <4 x i32> %468, i32 %466, i32 2 - %470 = insertelement <4 x i32> %469, i32 undef, i32 3 - %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4) - %472 = extractelement <4 x float> %471, i32 0 - %473 = extractelement <4 x float> %471, i32 1 - %474 = extractelement <4 x float> %471, i32 2 - %475 = fmul float %431, %472 - %476 = fadd float %475, %329 - %477 = fmul float %432, %473 - %478 = fadd float %477, %330 - %479 = fmul float %433, %474 - %480 = fadd float %479, %331 - %481 = fmul float %107, %107 - %482 = fmul float %108, %108 - %483 = fadd float %482, %481 - %484 = fmul float %109, %109 - %485 = fadd float %483, %484 - %486 = call float @llvm.AMDGPU.rsq.f32(float %485) - %487 = fmul float %107, %486 - %488 = fmul float %108, %486 - %489 = fmul float %109, %486 - %490 = fmul float %377, %40 - %491 = fmul float %378, %41 - %492 = fmul float %379, %42 - %493 = fmul float %359, %487 - %494 = fmul float %360, %488 - %495 = fadd float %494, %493 - %496 = fmul float %361, %489 - %497 = fadd float %495, %496 - %498 = fmul float %497, %359 - %499 = fmul float %497, %360 - %500 = fmul float %497, %361 - %501 = fmul float 2.000000e+00, %498 - %502 = fmul float 2.000000e+00, %499 - %503 = fmul float 2.000000e+00, %500 - %504 = fsub float -0.000000e+00, %501 - %505 = fadd float %487, %504 - %506 = fsub float -0.000000e+00, %502 - %507 = fadd float %488, %506 - %508 = fsub float -0.000000e+00, %503 - 
%509 = fadd float %489, %508 - %510 = fmul float %95, %95 - %511 = fmul float %96, %96 - %512 = fadd float %511, %510 - %513 = fmul float %97, %97 - %514 = fadd float %512, %513 - %515 = call float @llvm.AMDGPU.rsq.f32(float %514) - %516 = fmul float %95, %515 - %517 = fmul float %96, %515 - %518 = fmul float %97, %515 - %519 = fmul float %505, %516 - %520 = fmul float %507, %517 - %521 = fadd float %520, %519 - %522 = fmul float %509, %518 - %523 = fadd float %521, %522 - %524 = fsub float -0.000000e+00, %523 - %525 = fcmp uge float %524, 0.000000e+00 - %526 = select i1 %525, float %524, float 0.000000e+00 - %527 = fmul float %43, %380 - %528 = fadd float %527, 1.000000e+00 - %529 = call float @llvm.pow.f32(float %526, float %528) - %530 = fmul float %476, %37 - %531 = fmul float %478, %38 - %532 = fmul float %480, %39 - %533 = fmul float %359, %487 - %534 = fmul float %360, %488 - %535 = fadd float %534, %533 - %536 = fmul float %361, %489 - %537 = fadd float %535, %536 - %538 = fcmp uge float %537, 0.000000e+00 - %539 = select i1 %538, float %537, float 0.000000e+00 - %540 = fmul float %530, %539 - %541 = fmul float %531, %539 - %542 = fmul float %532, %539 - %543 = fmul float %490, %529 - %544 = fadd float %543, %540 - %545 = fmul float %491, %529 - %546 = fadd float %545, %541 - %547 = fmul float %492, %529 - %548 = fadd float %547, %542 - %549 = fmul float %476, %34 - %550 = fmul float %478, %35 - %551 = fmul float %480, %36 - %552 = fmul float %544, %57 - %553 = fadd float %552, %549 - %554 = fmul float %546, %58 - %555 = fadd float %554, %550 - %556 = fmul float %548, %59 - %557 = fadd float %556, %551 - %558 = bitcast float %136 to i32 - %559 = bitcast float %182 to i32 - %560 = bitcast float %137 to i32 - %561 = bitcast float %183 to i32 - %562 = bitcast float %temp28.1 to i32 - %563 = bitcast float %temp29.1 to i32 - %564 = insertelement <8 x i32> undef, i32 %558, i32 0 - %565 = insertelement <8 x i32> %564, i32 %559, i32 1 - %566 = insertelement <8 x i32> %565, i32 %560, i32 2 - %567 = insertelement <8 x i32> %566, i32 %561, i32 3 - %568 = insertelement <8 x i32> %567, i32 %562, i32 4 - %569 = insertelement <8 x i32> %568, i32 %563, i32 5 - %570 = insertelement <8 x i32> %569, i32 undef, i32 6 - %571 = insertelement <8 x i32> %570, i32 undef, i32 7 - %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2) - %573 = extractelement <4 x float> %572, i32 0 - %574 = extractelement <4 x float> %572, i32 1 - %575 = extractelement <4 x float> %572, i32 2 - %576 = fmul float %573, %44 - %577 = fadd float %576, %553 - %578 = fmul float %574, %45 - %579 = fadd float %578, %555 - %580 = fmul float %575, %46 - %581 = fadd float %580, %557 - %582 = call i32 @llvm.SI.packf16(float %577, float %579) - %583 = bitcast i32 %582 to float - %584 = call i32 @llvm.SI.packf16(float %581, float %283) - %585 = bitcast i32 %584 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585) - ret void - -ENDIF66: ; preds = %LOOP65 - %586 = bitcast float %temp28.1 to i32 - %587 = bitcast float %temp29.1 to i32 - %588 = insertelement <8 x i32> %237, i32 %586, i32 4 - %589 = insertelement <8 x i32> %588, i32 %587, i32 5 - %590 = insertelement <8 x i32> %589, i32 undef, i32 6 - %591 = insertelement <8 x i32> %590, i32 undef, i32 7 - %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2) - %593 = extractelement <4 x float> %592, i32 3 - %594 = fcmp oge float 
%temp30.1, %593 - %595 = sext i1 %594 to i32 - %596 = bitcast i32 %595 to float - %597 = bitcast float %596 to i32 - %598 = and i32 %597, 1065353216 - %599 = bitcast i32 %598 to float - %600 = fmul float 5.000000e-01, %temp32.0 - %601 = fsub float -0.000000e+00, %600 - %602 = fmul float %599, %temp32.0 - %603 = fadd float %602, %601 - %604 = fmul float %214, %603 - %605 = fadd float %604, %temp28.1 - %606 = fmul float %215, %603 - %607 = fadd float %606, %temp29.1 - %608 = fmul float %216, %603 - %609 = fadd float %608, %temp30.1 - %610 = fadd float %temp24.1, 1.000000e+00 - %611 = fmul float %temp32.0, 5.000000e-01 - br label %LOOP65 -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: readnone -declare i32 @llvm.SI.tid() #2 - -; Function Attrs: readonly -declare float @ceil(float) #3 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2 - -; Function Attrs: readnone -declare float @fabs(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #4 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } -attributes #3 = { readonly } -attributes #4 = { nounwind readonly } - -!0 = !{!"const", null, i32 1} - -; CHECK-LABEL: {{^}}main1: -; CHECK: s_endpgm -define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0) - %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4) - %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8) - %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12) - %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28) - %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48) - %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52) - %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56) - %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64) - %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68) - %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72) - %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76) - %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) - %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) - %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) - %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148) - %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152) - %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) - %41 = call float 
@llvm.SI.load.const(<16 x i8> %22, i32 164) - %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168) - %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172) - %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) - %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) - %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) - %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) - %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) - %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) - %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) - %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) - %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) - %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220) - %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236) - %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) - %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) - %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) - %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252) - %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) - %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260) - %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264) - %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268) - %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) - %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) - %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) - %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284) - %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) - %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) - %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464) - %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468) - %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472) - %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496) - %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500) - %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504) - %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512) - %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516) - %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524) - %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532) - %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536) - %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540) - %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544) - %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548) - %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552) - %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556) - %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560) - %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564) - %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568) - %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572) - %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576) - %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580) - %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584) - %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588) - %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592) - %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596) - %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600) - %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604) - %97 = call float @llvm.SI.load.const(<16 x 
i8> %22, i32 608) - %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612) - %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616) - %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624) - %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628) - %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632) - %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636) - %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640) - %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644) - %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648) - %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652) - %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656) - %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660) - %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664) - %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668) - %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672) - %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676) - %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680) - %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684) - %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688) - %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692) - %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696) - %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700) - %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704) - %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708) - %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712) - %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716) - %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864) - %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868) - %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0 - %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0 - %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 - %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0 - %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 - %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0 - %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 - %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0 - %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 - %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0 - %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 - %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0 - %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 - %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0 - %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 - %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0 - %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 - %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0 - %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 - %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0 - %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] 
addrspace(2)* %1, i64 0, i32 5 - %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0 - %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 - %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0 - %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 - %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0 - %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 - %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0 - %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 - %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0 - %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8 - %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0 - %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8 - %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0 - %162 = fcmp ugt float %17, 0.000000e+00 - %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00 - %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) - %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) - %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6) - %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6) - %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) - %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) - %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) - %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6) - %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) - %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) - %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) - %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6) - %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) - %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) - %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) - %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6) - %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) - %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) - %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) - %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6) - %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) - %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) - %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) - %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6) - %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6) - %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6) - %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6) - %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6) - %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6) - %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6) - %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6) - %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6) - %196 = fmul float %14, %124 - 
%197 = fadd float %196, %125 - %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %202 = bitcast float %198 to i32 - %203 = icmp ne i32 %202, 0 - %. = select i1 %203, float -1.000000e+00, float 1.000000e+00 - %204 = fsub float -0.000000e+00, %164 - %205 = fadd float %44, %204 - %206 = fsub float -0.000000e+00, %165 - %207 = fadd float %45, %206 - %208 = fsub float -0.000000e+00, %166 - %209 = fadd float %46, %208 - %210 = fmul float %205, %205 - %211 = fmul float %207, %207 - %212 = fadd float %211, %210 - %213 = fmul float %209, %209 - %214 = fadd float %212, %213 - %215 = call float @llvm.AMDGPU.rsq.f32(float %214) - %216 = fmul float %205, %215 - %217 = fmul float %207, %215 - %218 = fmul float %209, %215 - %219 = fmul float %., %54 - %220 = fmul float %13, %47 - %221 = fmul float %197, %48 - %222 = bitcast float %174 to i32 - %223 = bitcast float %175 to i32 - %224 = insertelement <2 x i32> undef, i32 %222, i32 0 - %225 = insertelement <2 x i32> %224, i32 %223, i32 1 - %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2) - %227 = extractelement <4 x float> %226, i32 0 - %228 = extractelement <4 x float> %226, i32 1 - %229 = extractelement <4 x float> %226, i32 2 - %230 = extractelement <4 x float> %226, i32 3 - %231 = fmul float %227, 0x4012611180000000 - %232 = fmul float %228, 0x4012611180000000 - %233 = fmul float %229, 0x4012611180000000 - %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00) - %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00) - %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00) - %237 = fmul float %216, %184 - %238 = fmul float %217, %185 - %239 = fadd float %238, %237 - %240 = fmul float %218, %186 - %241 = fadd float %239, %240 - %242 = fmul float %216, %187 - %243 = fmul float %217, %188 - %244 = fadd float %243, %242 - %245 = fmul float %218, %189 - %246 = fadd float %244, %245 - %247 = fmul float %216, %190 - %248 = fmul float %217, %191 - %249 = fadd float %248, %247 - %250 = fmul float %218, %192 - %251 = fadd float %249, %250 - %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00) - %253 = fmul float %214, 0x3F5A36E2E0000000 - %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00) - %255 = fsub float -0.000000e+00, %254 - %256 = fadd float 1.000000e+00, %255 - %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01) - %258 = fmul float %39, %257 - %259 = fmul float %241, %258 - %260 = fmul float %246, %258 - %261 = fmul float %259, %230 - %262 = fmul float %260, %230 - %263 = fadd float %252, 0x3EE4F8B580000000 - %264 = fsub float -0.000000e+00, %252 - %265 = fadd float 1.000000e+00, %264 - %266 = fmul float 1.200000e+01, %265 - %267 = fadd float %266, 4.000000e+00 - %268 = fsub float -0.000000e+00, %267 - %269 = fmul float %268, %263 - %270 = fsub float -0.000000e+00, %267 - %271 = fmul float %270, %263 - %272 = fsub float -0.000000e+00, %267 - %273 = fmul float %272, %263 - %274 = fdiv float 1.000000e+00, %269 - %275 = fdiv float 1.000000e+00, %271 - %276 = fdiv float 1.000000e+00, %273 - %277 = fmul float %261, %274 - %278 
= fmul float %262, %275 - %279 = fmul float %263, %276 - br label %LOOP - -LOOP: ; preds = %LOOP, %main_body - %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ] - %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ] - %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ] - %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ] - %280 = bitcast float %temp168.0 to i32 - %281 = bitcast float %temp169.0 to i32 - %282 = insertelement <4 x i32> undef, i32 %280, i32 0 - %283 = insertelement <4 x i32> %282, i32 %281, i32 1 - %284 = insertelement <4 x i32> %283, i32 0, i32 2 - %285 = insertelement <4 x i32> %284, i32 undef, i32 3 - %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2) - %287 = extractelement <4 x float> %286, i32 3 - %288 = fadd float %temp168.0, %277 - %289 = fadd float %temp169.0, %278 - %290 = fadd float %temp170.0, %279 - %291 = fsub float -0.000000e+00, %287 - %292 = fadd float %290, %291 - %293 = fcmp oge float 0.000000e+00, %292 - %294 = sext i1 %293 to i32 - %295 = bitcast i32 %294 to float - %296 = bitcast float %295 to i32 - %297 = icmp ne i32 %296, 0 - br i1 %297, label %IF189, label %LOOP - -IF189: ; preds = %LOOP - %298 = extractelement <4 x float> %286, i32 0 - %299 = extractelement <4 x float> %286, i32 1 - %300 = extractelement <4 x float> %286, i32 2 - %301 = fsub float -0.000000e+00, %292 - %302 = fadd float %temp144.0, %301 - %303 = fdiv float 1.000000e+00, %302 - %304 = fmul float %292, %303 - %305 = fadd float %304, -1.000000e+00 - %306 = fmul float %305, %277 - %307 = fadd float %306, %288 - %308 = fmul float %305, %278 - %309 = fadd float %308, %289 - %310 = fsub float -0.000000e+00, %176 - %311 = fadd float %307, %310 - %312 = fsub float -0.000000e+00, %177 - %313 = fadd float %309, %312 - %314 = fadd float %176, %311 - %315 = fadd float %177, %313 - %316 = fmul float %311, %67 - %317 = fmul float %313, %68 - %318 = fmul float %316, %55 - %319 = fmul float %316, %56 - %320 = fmul float %317, %57 - %321 = fadd float %320, %318 - %322 = fmul float %317, %58 - %323 = fadd float %322, %319 - %324 = fadd float %178, %321 - %325 = fadd float %179, %323 - %326 = fmul float %316, %59 - %327 = fmul float %316, %60 - %328 = fmul float %316, %61 - %329 = fmul float %316, %62 - %330 = fmul float %317, %63 - %331 = fadd float %330, %326 - %332 = fmul float %317, %64 - %333 = fadd float %332, %327 - %334 = fmul float %317, %65 - %335 = fadd float %334, %328 - %336 = fmul float %317, %66 - %337 = fadd float %336, %329 - %338 = fadd float %168, %331 - %339 = fadd float %169, %333 - %340 = fadd float %170, %335 - %341 = fadd float %171, %337 - %342 = bitcast float %338 to i32 - %343 = bitcast float %339 to i32 - %344 = insertelement <2 x i32> undef, i32 %342, i32 0 - %345 = insertelement <2 x i32> %344, i32 %343, i32 1 - %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2) - %347 = extractelement <4 x float> %346, i32 0 - %348 = extractelement <4 x float> %346, i32 1 - %349 = extractelement <4 x float> %346, i32 2 - %350 = extractelement <4 x float> %346, i32 3 - %351 = fmul float %347, %23 - %352 = fmul float %348, %24 - %353 = fmul float %349, %25 - %354 = fmul float %350, %26 - %355 = fmul float %351, %180 - %356 = fmul float %352, %181 - %357 = fmul float %353, %182 - %358 = fmul float %354, %183 - %359 = fsub float -0.000000e+00, %350 - %360 = fadd float 1.000000e+00, %359 - %361 = fmul float %360, %49 - %362 = call float 
@llvm.AMDGPU.lrp(float %361, float %347, float %355) - %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356) - %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357) - %365 = bitcast float %340 to i32 - %366 = bitcast float %341 to i32 - %367 = insertelement <2 x i32> undef, i32 %365, i32 0 - %368 = insertelement <2 x i32> %367, i32 %366, i32 1 - %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2) - %370 = extractelement <4 x float> %369, i32 2 - %371 = fmul float %362, %234 - %372 = fmul float %363, %235 - %373 = fmul float %364, %236 - %374 = fmul float %358, %230 - %375 = bitcast float %314 to i32 - %376 = bitcast float %315 to i32 - %377 = insertelement <2 x i32> undef, i32 %375, i32 0 - %378 = insertelement <2 x i32> %377, i32 %376, i32 1 - %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2) - %380 = extractelement <4 x float> %379, i32 0 - %381 = extractelement <4 x float> %379, i32 1 - %382 = extractelement <4 x float> %379, i32 2 - %383 = extractelement <4 x float> %379, i32 3 - %384 = fcmp olt float 0.000000e+00, %382 - %385 = sext i1 %384 to i32 - %386 = bitcast i32 %385 to float - %387 = bitcast float %386 to i32 - %388 = icmp ne i32 %387, 0 - %.224 = select i1 %388, float %381, float %380 - %.225 = select i1 %388, float %383, float %381 - %389 = bitcast float %324 to i32 - %390 = bitcast float %325 to i32 - %391 = insertelement <2 x i32> undef, i32 %389, i32 0 - %392 = insertelement <2 x i32> %391, i32 %390, i32 1 - %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2) - %394 = extractelement <4 x float> %393, i32 0 - %395 = extractelement <4 x float> %393, i32 1 - %396 = extractelement <4 x float> %393, i32 2 - %397 = extractelement <4 x float> %393, i32 3 - %398 = fcmp olt float 0.000000e+00, %396 - %399 = sext i1 %398 to i32 - %400 = bitcast i32 %399 to float - %401 = bitcast float %400 to i32 - %402 = icmp ne i32 %401, 0 - %temp112.1 = select i1 %402, float %395, float %394 - %temp113.1 = select i1 %402, float %397, float %395 - %403 = fmul float %.224, 2.000000e+00 - %404 = fadd float %403, -1.000000e+00 - %405 = fmul float %.225, 2.000000e+00 - %406 = fadd float %405, -1.000000e+00 - %407 = fmul float %temp112.1, 2.000000e+00 - %408 = fadd float %407, -1.000000e+00 - %409 = fmul float %temp113.1, 2.000000e+00 - %410 = fadd float %409, -1.000000e+00 - %411 = fsub float -0.000000e+00, %404 - %412 = fmul float %411, %35 - %413 = fsub float -0.000000e+00, %406 - %414 = fmul float %413, %35 - %415 = fsub float -0.000000e+00, %408 - %416 = fmul float %415, %36 - %417 = fsub float -0.000000e+00, %410 - %418 = fmul float %417, %36 - %419 = fmul float %416, %370 - %420 = fmul float %418, %370 - %421 = call float @fabs(float %412) - %422 = call float @fabs(float %414) - %423 = fsub float -0.000000e+00, %421 - %424 = fadd float 1.000000e+00, %423 - %425 = fsub float -0.000000e+00, %422 - %426 = fadd float 1.000000e+00, %425 - %427 = fmul float %424, %419 - %428 = fadd float %427, %412 - %429 = fmul float %426, %420 - %430 = fadd float %429, %414 - %431 = fmul float %428, %428 - %432 = fmul float %430, %430 - %433 = fadd float %431, %432 - %434 = fsub float -0.000000e+00, %433 - %435 = fadd float 0x3FF00068E0000000, %434 - %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) - %437 = call float @llvm.AMDGPU.rsq.f32(float %436) - %438 = fmul float %437, %436 - %439 = fsub 
float -0.000000e+00, %436 - %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) - %441 = fmul float %184, %428 - %442 = fmul float %185, %428 - %443 = fmul float %186, %428 - %444 = fmul float %187, %430 - %445 = fadd float %444, %441 - %446 = fmul float %188, %430 - %447 = fadd float %446, %442 - %448 = fmul float %189, %430 - %449 = fadd float %448, %443 - %450 = fmul float %190, %440 - %451 = fadd float %450, %445 - %452 = fmul float %191, %440 - %453 = fadd float %452, %447 - %454 = fmul float %192, %440 - %455 = fadd float %454, %449 - %456 = fmul float %451, %451 - %457 = fmul float %453, %453 - %458 = fadd float %457, %456 - %459 = fmul float %455, %455 - %460 = fadd float %458, %459 - %461 = call float @llvm.AMDGPU.rsq.f32(float %460) - %462 = fmul float %451, %461 - %463 = fmul float %453, %461 - %464 = fmul float %455, %461 - %465 = fcmp olt float 0.000000e+00, %219 - %466 = sext i1 %465 to i32 - %467 = bitcast i32 %466 to float - %468 = bitcast float %467 to i32 - %469 = icmp ne i32 %468, 0 - br i1 %469, label %IF198, label %ENDIF197 - -IF198: ; preds = %IF189 - %470 = fsub float -0.000000e+00, %462 - %471 = fsub float -0.000000e+00, %463 - %472 = fsub float -0.000000e+00, %464 - br label %ENDIF197 - -ENDIF197: ; preds = %IF189, %IF198 - %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ] - %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ] - %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ] - %473 = bitcast float %220 to i32 - %474 = bitcast float %221 to i32 - %475 = insertelement <2 x i32> undef, i32 %473, i32 0 - %476 = insertelement <2 x i32> %475, i32 %474, i32 1 - %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2) - %478 = extractelement <4 x float> %477, i32 0 - %479 = extractelement <4 x float> %477, i32 1 - %480 = extractelement <4 x float> %477, i32 2 - %481 = extractelement <4 x float> %477, i32 3 - %482 = fmul float %478, %40 - %483 = fadd float %482, %41 - %484 = fmul float %479, %40 - %485 = fadd float %484, %41 - %486 = fmul float %480, %40 - %487 = fadd float %486, %41 - %488 = fmul float %481, %42 - %489 = fadd float %488, %43 - %490 = bitcast float %172 to i32 - %491 = bitcast float %173 to i32 - %492 = insertelement <2 x i32> undef, i32 %490, i32 0 - %493 = insertelement <2 x i32> %492, i32 %491, i32 1 - %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2) - %495 = extractelement <4 x float> %494, i32 0 - %496 = extractelement <4 x float> %494, i32 1 - %497 = extractelement <4 x float> %494, i32 2 - %498 = extractelement <4 x float> %494, i32 3 - %499 = fmul float %498, 3.200000e+01 - %500 = fadd float %499, -1.600000e+01 - %501 = call float @llvm.AMDIL.exp.(float %500) - %502 = fmul float %495, %501 - %503 = fmul float %496, %501 - %504 = fmul float %497, %501 - %505 = fmul float %28, %502 - %506 = fadd float %505, %193 - %507 = fmul float %29, %503 - %508 = fadd float %507, %194 - %509 = fmul float %30, %504 - %510 = fadd float %509, %195 - %511 = fmul float %506, %489 - %512 = fmul float %508, %489 - %513 = fmul float %510, %489 - %514 = fmul float %489, 5.000000e-01 - %515 = fadd float %514, 5.000000e-01 - %516 = fmul float %483, %515 - %517 = fadd float %516, %511 - %518 = fmul float %485, %515 - %519 = fadd float %518, %512 - %520 = fmul float %487, %515 - %521 = fadd float %520, %513 - %522 = fmul float %517, %371 - %523 = fmul float %519, %372 - %524 = fmul float %521, %373 - %525 = fmul float %428, 
0x3FDB272440000000 - %526 = fmul float %430, 0xBFDB272440000000 - %527 = fadd float %526, %525 - %528 = fmul float %440, 0x3FE99999A0000000 - %529 = fadd float %527, %528 - %530 = fmul float %529, 5.000000e-01 - %531 = fadd float %530, 0x3FE3333340000000 - %532 = fmul float %531, %531 - %533 = fmul float %522, %532 - %534 = fmul float %523, %532 - %535 = fmul float %524, %532 - %536 = fsub float -0.000000e+00, %72 - %537 = fsub float -0.000000e+00, %73 - %538 = fsub float -0.000000e+00, %74 - %539 = fmul float %temp12.0, %536 - %540 = fmul float %temp13.0, %537 - %541 = fadd float %540, %539 - %542 = fmul float %temp14.0, %538 - %543 = fadd float %541, %542 - %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00) - %545 = fmul float %371, %544 - %546 = fmul float %372, %544 - %547 = fmul float %373, %544 - %548 = fmul float %545, %69 - %549 = fmul float %546, %70 - %550 = fmul float %547, %71 - %551 = fsub float -0.000000e+00, %164 - %552 = fadd float %97, %551 - %553 = fsub float -0.000000e+00, %165 - %554 = fadd float %98, %553 - %555 = fsub float -0.000000e+00, %166 - %556 = fadd float %99, %555 - %557 = fmul float %552, %552 - %558 = fmul float %554, %554 - %559 = fadd float %558, %557 - %560 = fmul float %556, %556 - %561 = fadd float %559, %560 - %562 = call float @llvm.AMDGPU.rsq.f32(float %561) - %563 = fmul float %562, %561 - %564 = fsub float -0.000000e+00, %561 - %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) - %566 = fsub float -0.000000e+00, %84 - %567 = fadd float %565, %566 - %568 = fsub float -0.000000e+00, %83 - %569 = fadd float %565, %568 - %570 = fsub float -0.000000e+00, %82 - %571 = fadd float %565, %570 - %572 = fsub float -0.000000e+00, %84 - %573 = fadd float %83, %572 - %574 = fsub float -0.000000e+00, %83 - %575 = fadd float %82, %574 - %576 = fsub float -0.000000e+00, %82 - %577 = fadd float %81, %576 - %578 = fdiv float 1.000000e+00, %573 - %579 = fdiv float 1.000000e+00, %575 - %580 = fdiv float 1.000000e+00, %577 - %581 = fmul float %567, %578 - %582 = fmul float %569, %579 - %583 = fmul float %571, %580 - %584 = fcmp olt float %565, %83 - %585 = sext i1 %584 to i32 - %586 = bitcast i32 %585 to float - %587 = bitcast float %586 to i32 - %588 = icmp ne i32 %587, 0 - br i1 %588, label %ENDIF200, label %ELSE202 - -ELSE202: ; preds = %ENDIF197 - %589 = fcmp olt float %565, %82 - %590 = sext i1 %589 to i32 - %591 = bitcast i32 %590 to float - %592 = bitcast float %591 to i32 - %593 = icmp ne i32 %592, 0 - br i1 %593, label %ENDIF200, label %ELSE205 - -ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197 - %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ] - %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ] - %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ] - %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ] - %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ] - %594 = fcmp olt float %565, %83 - %595 = sext i1 %594 to i32 - %596 = bitcast i32 %595 to float - %597 = bitcast float %596 to i32 - %598 = icmp ne i32 %597, 0 - br i1 %598, label %ENDIF209, label %ELSE211 - -ELSE205: ; preds = %ELSE202 - %599 = fcmp olt float %565, %81 - %600 = sext i1 %599 to i32 - %601 = bitcast i32 %600 to float - %602 = bitcast float %601 to i32 - %603 = icmp ne i32 %602, 0 - %.226 = select i1 %603, float %583, float 1.000000e+00 - %.227 = select i1 %603, float 
%118, float %116 - %.228 = select i1 %603, float %119, float %117 - br label %ENDIF200 - -ELSE211: ; preds = %ENDIF200 - %604 = fcmp olt float %565, %82 - %605 = sext i1 %604 to i32 - %606 = bitcast i32 %605 to float - %607 = bitcast float %606 to i32 - %608 = icmp ne i32 %607, 0 - br i1 %608, label %ENDIF209, label %ELSE214 - -ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200 - %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ] - %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ] - %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ] - %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ] - %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ] - %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ] - %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ] - %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ] - %609 = fmul float %164, %85 - %610 = fmul float %165, %86 - %611 = fadd float %609, %610 - %612 = fmul float %166, %87 - %613 = fadd float %611, %612 - %614 = fmul float %167, %88 - %615 = fadd float %613, %614 - %616 = fmul float %164, %89 - %617 = fmul float %165, %90 - %618 = fadd float %616, %617 - %619 = fmul float %166, %91 - %620 = fadd float %618, %619 - %621 = fmul float %167, %92 - %622 = fadd float %620, %621 - %623 = fmul float %164, %93 - %624 = fmul float %165, %94 - %625 = fadd float %623, %624 - %626 = fmul float %166, %95 - %627 = fadd float %625, %626 - %628 = fmul float %167, %96 - %629 = fadd float %627, %628 - %630 = fsub float -0.000000e+00, %78 - %631 = fadd float 1.000000e+00, %630 - %632 = call float @fabs(float %615) - %633 = call float @fabs(float %622) - %634 = fcmp oge float %631, %632 - %635 = sext i1 %634 to i32 - %636 = bitcast i32 %635 to float - %637 = bitcast float %636 to i32 - %638 = and i32 %637, 1065353216 - %639 = bitcast i32 %638 to float - %640 = fcmp oge float %631, %633 - %641 = sext i1 %640 to i32 - %642 = bitcast i32 %641 to float - %643 = bitcast float %642 to i32 - %644 = and i32 %643, 1065353216 - %645 = bitcast i32 %644 to float - %646 = fmul float %639, %645 - %647 = fmul float %629, %646 - %648 = fmul float %615, %temp68.0 - %649 = fadd float %648, %temp70.0 - %650 = fmul float %622, %temp69.0 - %651 = fadd float %650, %temp71.0 - %652 = fmul float %615, %temp52.0 - %653 = fadd float %652, %temp54.0 - %654 = fmul float %622, %temp53.0 - %655 = fadd float %654, %temp55.0 - %656 = fadd float %temp80.0, -1.000000e+00 - %657 = fmul float %656, %77 - %658 = fadd float %657, 1.000000e+00 - %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00) - %660 = bitcast float %649 to i32 - %661 = bitcast float %651 to i32 - %662 = bitcast float 0.000000e+00 to i32 - %663 = insertelement <4 x i32> undef, i32 %660, i32 0 - %664 = insertelement <4 x i32> %663, i32 %661, i32 1 - %665 = insertelement <4 x i32> %664, i32 %662, i32 2 - %666 = insertelement <4 x i32> %665, i32 undef, i32 3 - %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2) - %668 = extractelement <4 x float> %667, i32 0 - %669 = extractelement <4 x float> %667, i32 1 - %670 = bitcast float %653 to i32 - %671 = bitcast float %655 to i32 - %672 = bitcast float 0.000000e+00 to i32 - %673 = insertelement <4 x i32> undef, i32 %670, i32 0 - %674 = insertelement <4 x i32> %673, 
i32 %671, i32 1 - %675 = insertelement <4 x i32> %674, i32 %672, i32 2 - %676 = insertelement <4 x i32> %675, i32 undef, i32 3 - %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2) - %678 = extractelement <4 x float> %677, i32 0 - %679 = extractelement <4 x float> %677, i32 1 - %680 = fsub float -0.000000e+00, %669 - %681 = fadd float 1.000000e+00, %680 - %682 = fsub float -0.000000e+00, %679 - %683 = fadd float 1.000000e+00, %682 - %684 = fmul float %681, 2.500000e-01 - %685 = fmul float %683, 2.500000e-01 - %686 = fsub float -0.000000e+00, %684 - %687 = fadd float %668, %686 - %688 = fsub float -0.000000e+00, %685 - %689 = fadd float %678, %688 - %690 = fmul float %647, %temp88.0 - %691 = fadd float %690, %temp89.0 - %692 = fmul float %647, %temp90.0 - %693 = fadd float %692, %temp91.0 - %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00) - %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00) - %696 = fsub float -0.000000e+00, %694 - %697 = fadd float %668, %696 - %698 = fsub float -0.000000e+00, %695 - %699 = fadd float %678, %698 - %700 = fmul float %668, %668 - %701 = fmul float %678, %678 - %702 = fsub float -0.000000e+00, %700 - %703 = fadd float %687, %702 - %704 = fsub float -0.000000e+00, %701 - %705 = fadd float %689, %704 - %706 = fcmp uge float %703, %75 - %707 = select i1 %706, float %703, float %75 - %708 = fcmp uge float %705, %75 - %709 = select i1 %708, float %705, float %75 - %710 = fmul float %697, %697 - %711 = fadd float %710, %707 - %712 = fmul float %699, %699 - %713 = fadd float %712, %709 - %714 = fdiv float 1.000000e+00, %711 - %715 = fdiv float 1.000000e+00, %713 - %716 = fmul float %707, %714 - %717 = fmul float %709, %715 - %718 = fcmp oge float %697, 0.000000e+00 - %719 = sext i1 %718 to i32 - %720 = bitcast i32 %719 to float - %721 = bitcast float %720 to i32 - %722 = icmp ne i32 %721, 0 - %.229 = select i1 %722, float 1.000000e+00, float %716 - %723 = fcmp oge float %699, 0.000000e+00 - %724 = sext i1 %723 to i32 - %725 = bitcast i32 %724 to float - %726 = bitcast float %725 to i32 - %727 = icmp ne i32 %726, 0 - %temp28.0 = select i1 %727, float 1.000000e+00, float %717 - %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229) - %729 = call float @llvm.pow.f32(float %728, float %76) - %730 = fmul float %729, %79 - %731 = fadd float %730, %80 - %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00) - %733 = fmul float %732, %732 - %734 = fmul float 2.000000e+00, %732 - %735 = fsub float -0.000000e+00, %734 - %736 = fadd float 3.000000e+00, %735 - %737 = fmul float %733, %736 - %738 = fmul float %548, %737 - %739 = fmul float %549, %737 - %740 = fmul float %550, %737 - %741 = fmul float %738, %515 - %742 = fadd float %741, %533 - %743 = fmul float %739, %515 - %744 = fadd float %743, %534 - %745 = fmul float %740, %515 - %746 = fadd float %745, %535 - %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00) - %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00) - %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00) - %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00) - %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00) - %752 = fmul float %748, %751 - %753 = fmul float %749, %751 - %754 = fmul float %750, %751 - %755 = fmul float %742, %752 - %756 = fmul float 
%744, %753 - %757 = fmul float %746, %754 - %758 = fmul float %temp12.0, %216 - %759 = fmul float %temp13.0, %217 - %760 = fadd float %759, %758 - %761 = fmul float %temp14.0, %218 - %762 = fadd float %760, %761 - %763 = call float @fabs(float %762) - %764 = fmul float %763, %763 - %765 = fmul float %764, %50 - %766 = fadd float %765, %51 - %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00) - %768 = fsub float -0.000000e+00, %767 - %769 = fadd float 1.000000e+00, %768 - %770 = fmul float %33, %769 - %771 = fmul float %33, %769 - %772 = fmul float %33, %769 - %773 = fmul float %34, %769 - %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755) - %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756) - %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757) - %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374) - %778 = fcmp uge float %774, 0x3E6FFFFE60000000 - %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000 - %780 = fcmp uge float %775, 0x3E6FFFFE60000000 - %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000 - %782 = fcmp uge float %776, 0x3E6FFFFE60000000 - %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000 - %784 = fcmp uge float %779, 6.550400e+04 - %785 = select i1 %784, float 6.550400e+04, float %779 - %786 = fcmp uge float %781, 6.550400e+04 - %787 = select i1 %786, float 6.550400e+04, float %781 - %788 = fcmp uge float %783, 6.550400e+04 - %789 = select i1 %788, float 6.550400e+04, float %783 - %790 = fmul float %777, %52 - %791 = fadd float %790, %53 - %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00) - %793 = call i32 @llvm.SI.packf16(float %785, float %787) - %794 = bitcast i32 %793 to float - %795 = call i32 @llvm.SI.packf16(float %789, float %792) - %796 = bitcast i32 %795 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796) - ret void - -ELSE214: ; preds = %ELSE211 - %797 = fcmp olt float %565, %81 - %798 = sext i1 %797 to i32 - %799 = bitcast i32 %798 to float - %800 = bitcast float %799 to i32 - %801 = icmp ne i32 %800, 0 - %.230 = select i1 %801, float %104, float %100 - %.231 = select i1 %801, float %105, float %101 - %.232 = select i1 %801, float %106, float %102 - %.233 = select i1 %801, float %107, float %103 - br label %ENDIF209 -} - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.cndlt(float, float, float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #2 - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } -attributes #3 = { nounwind readonly } -attributes #4 = { readonly } diff --git a/test/CodeGen/R600/si-spill-cf.ll b/test/CodeGen/R600/si-spill-cf.ll deleted file mode 100644 index 4b2d8ec6bf0..00000000000 --- a/test/CodeGen/R600/si-spill-cf.ll +++ /dev/null @@ -1,501 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | 
FileCheck -check-prefix=SI %s - -; If this occurs it is likely due to reordering and the restore was -; originally supposed to happen before SI_END_CF. -; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] -; SI-NOT: v_readlane_b32 [[SAVED]] - -define void @main() #0 { -main_body: - %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) - %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) - %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) - %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) - %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) - %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) - %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) - %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) - %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) - %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) - %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) - %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) - %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) - %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) - %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) - %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) - %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) - %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) - %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) - %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) - %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) - %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) - %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) - %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) - %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) - %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) - %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) - %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) - %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) - %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) - %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) - %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) - %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) - %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) - %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) - %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) - %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) - %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) - %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) - %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) - %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) - %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) - %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) - %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) - %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) - %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) - %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) - %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) - %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) - %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) - %50 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 336) - %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) - %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) - %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) - %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) - %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) - %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) - %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) - %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) - %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) - %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) - %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) - %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) - %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) - %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) - %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) - %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) - br label %LOOP - -LOOP: ; preds = %ENDIF2795, %main_body - %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] - %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] - %67 = icmp sgt i32 undef, 4 - br i1 %67, label %ENDLOOP, label %ENDIF - -ENDLOOP: ; preds = %ELSE2566, %LOOP - %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00) - ret void - -ENDIF: ; preds = %LOOP - %69 = fsub float %2, undef - %70 = fsub float %3, undef - %71 = fsub float %4, undef - %72 = fmul float %69, 0.000000e+00 - %73 = fmul float %70, undef - %74 = fmul float %71, undef - %75 = fsub float %6, undef - %76 = fsub float %7, undef - %77 = fmul float %75, undef - %78 = fmul float %76, 0.000000e+00 - %79 = call float @llvm.minnum.f32(float %74, float %78) - %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00) - %81 = call float @llvm.maxnum.f32(float %73, float %77) - %82 = call float @llvm.maxnum.f32(float undef, float %79) - %83 = call float @llvm.minnum.f32(float %80, float %81) - %84 = call float @llvm.minnum.f32(float %83, float undef) - %85 = fsub float %14, undef - %86 = fsub float %15, undef - %87 = fsub float %16, undef - %88 = fmul float %85, undef - %89 = fmul float %86, undef - %90 = fmul float %87, undef - %91 = fsub float %17, undef - %92 = fsub float %18, undef - %93 = fsub float %19, undef - %94 = fmul float %91, 0.000000e+00 - %95 = fmul float %92, undef - %96 = fmul float %93, undef - %97 = call float @llvm.minnum.f32(float %89, float %95) - %98 = call float @llvm.maxnum.f32(float %88, float %94) - %99 = call float @llvm.maxnum.f32(float %90, float %96) - %100 = call float @llvm.maxnum.f32(float undef, float %97) - %101 = call float @llvm.maxnum.f32(float %100, float undef) - %102 = call float @llvm.minnum.f32(float %98, float undef) - %103 = call float @llvm.minnum.f32(float %102, float %99) - %104 = fsub float %30, undef - %105 = fsub float %31, undef - %106 = fmul float %104, 0.000000e+00 - %107 = fmul float %105, 0.000000e+00 - %108 = call float @llvm.minnum.f32(float undef, float %106) - %109 = call float @llvm.maxnum.f32(float undef, float %107) - %110 = call float @llvm.maxnum.f32(float undef, float %108) - %111 = call float @llvm.maxnum.f32(float %110, float undef) - %112 = call float @llvm.minnum.f32(float undef, float %109) - %113 = 
fsub float %32, undef - %114 = fsub float %33, undef - %115 = fsub float %34, undef - %116 = fmul float %113, 0.000000e+00 - %117 = fmul float %114, undef - %118 = fmul float %115, undef - %119 = fsub float %35, undef - %120 = fsub float %36, undef - %121 = fsub float %37, undef - %122 = fmul float %119, undef - %123 = fmul float %120, undef - %124 = fmul float %121, undef - %125 = call float @llvm.minnum.f32(float %116, float %122) - %126 = call float @llvm.minnum.f32(float %117, float %123) - %127 = call float @llvm.minnum.f32(float %118, float %124) - %128 = call float @llvm.maxnum.f32(float %125, float %126) - %129 = call float @llvm.maxnum.f32(float %128, float %127) - %130 = fsub float %38, undef - %131 = fsub float %39, undef - %132 = fsub float %40, undef - %133 = fmul float %130, 0.000000e+00 - %134 = fmul float %131, undef - %135 = fmul float %132, undef - %136 = fsub float %41, undef - %137 = fsub float %42, undef - %138 = fsub float %43, undef - %139 = fmul float %136, undef - %140 = fmul float %137, undef - %141 = fmul float %138, undef - %142 = call float @llvm.minnum.f32(float %133, float %139) - %143 = call float @llvm.minnum.f32(float %134, float %140) - %144 = call float @llvm.minnum.f32(float %135, float %141) - %145 = call float @llvm.maxnum.f32(float %142, float %143) - %146 = call float @llvm.maxnum.f32(float %145, float %144) - %147 = fsub float %44, undef - %148 = fsub float %45, undef - %149 = fsub float %46, undef - %150 = fmul float %147, 0.000000e+00 - %151 = fmul float %148, 0.000000e+00 - %152 = fmul float %149, undef - %153 = fsub float %47, undef - %154 = fsub float %48, undef - %155 = fsub float %49, undef - %156 = fmul float %153, undef - %157 = fmul float %154, 0.000000e+00 - %158 = fmul float %155, undef - %159 = call float @llvm.minnum.f32(float %150, float %156) - %160 = call float @llvm.minnum.f32(float %151, float %157) - %161 = call float @llvm.minnum.f32(float %152, float %158) - %162 = call float @llvm.maxnum.f32(float %159, float %160) - %163 = call float @llvm.maxnum.f32(float %162, float %161) - %164 = fsub float %50, undef - %165 = fsub float %51, undef - %166 = fsub float %52, undef - %167 = fmul float %164, undef - %168 = fmul float %165, 0.000000e+00 - %169 = fmul float %166, 0.000000e+00 - %170 = fsub float %53, undef - %171 = fsub float %54, undef - %172 = fsub float %55, undef - %173 = fdiv float 1.000000e+00, %temp18.0 - %174 = fmul float %170, undef - %175 = fmul float %171, undef - %176 = fmul float %172, %173 - %177 = call float @llvm.minnum.f32(float %167, float %174) - %178 = call float @llvm.minnum.f32(float %168, float %175) - %179 = call float @llvm.minnum.f32(float %169, float %176) - %180 = call float @llvm.maxnum.f32(float %177, float %178) - %181 = call float @llvm.maxnum.f32(float %180, float %179) - %182 = fsub float %62, undef - %183 = fsub float %63, undef - %184 = fsub float %64, undef - %185 = fmul float %182, 0.000000e+00 - %186 = fmul float %183, undef - %187 = fmul float %184, undef - %188 = fsub float %65, undef - %189 = fsub float %66, undef - %190 = fmul float %188, undef - %191 = fmul float %189, undef - %192 = call float @llvm.maxnum.f32(float %185, float %190) - %193 = call float @llvm.maxnum.f32(float %186, float %191) - %194 = call float @llvm.maxnum.f32(float %187, float undef) - %195 = call float @llvm.minnum.f32(float %192, float %193) - %196 = call float @llvm.minnum.f32(float %195, float %194) - %.temp292.7 = select i1 undef, float %163, float undef - %temp292.9 = select i1 false, float %181, float 
%.temp292.7 - %.temp292.9 = select i1 undef, float undef, float %temp292.9 - %197 = fcmp ogt float undef, 0.000000e+00 - %198 = fcmp olt float undef, %196 - %199 = and i1 %197, %198 - %200 = fcmp olt float undef, %.temp292.9 - %201 = and i1 %199, %200 - %temp292.11 = select i1 %201, float undef, float %.temp292.9 - br i1 undef, label %IF2565, label %ELSE2566 - -IF2565: ; preds = %ENDIF - br i1 false, label %ENDIF2582, label %ELSE2584 - -ELSE2566: ; preds = %ENDIF - %202 = fcmp oeq float %temp292.11, 1.000000e+04 - br i1 %202, label %ENDLOOP, label %ELSE2593 - -ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 - %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] - %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ] - %203 = fsub float %5, undef - %204 = fmul float %203, undef - %205 = call float @llvm.maxnum.f32(float undef, float %204) - %206 = call float @llvm.minnum.f32(float %205, float undef) - %207 = call float @llvm.minnum.f32(float %206, float undef) - %208 = fcmp ogt float undef, 0.000000e+00 - %209 = fcmp olt float undef, 1.000000e+00 - %210 = and i1 %208, %209 - %211 = fcmp olt float undef, %207 - %212 = and i1 %210, %211 - br i1 %212, label %ENDIF2795, label %ELSE2797 - -ELSE2584: ; preds = %IF2565 - br label %ENDIF2582 - -ENDIF2582: ; preds = %ELSE2584, %IF2565 - %213 = fadd float %1, undef - %214 = fadd float 0.000000e+00, %213 - %215 = call float @llvm.AMDIL.fraction.(float %214) - br i1 undef, label %IF2589, label %ELSE2590 - -IF2589: ; preds = %ENDIF2582 - br label %ENDIF2588 - -ELSE2590: ; preds = %ENDIF2582 - br label %ENDIF2588 - -ENDIF2588: ; preds = %ELSE2590, %IF2589 - %216 = fsub float 1.000000e+00, %215 - %217 = call float @llvm.sqrt.f32(float %216) - %218 = fmul float %217, undef - %219 = fadd float %218, undef - br label %ENDIF2564 - -ELSE2593: ; preds = %ELSE2566 - %220 = fcmp oeq float %temp292.11, %82 - %221 = fcmp olt float %82, %84 - %222 = and i1 %220, %221 - br i1 %222, label %ENDIF2594, label %ELSE2596 - -ELSE2596: ; preds = %ELSE2593 - %223 = fcmp oeq float %temp292.11, %101 - %224 = fcmp olt float %101, %103 - %225 = and i1 %223, %224 - br i1 %225, label %ENDIF2594, label %ELSE2632 - -ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 - %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] - %226 = fmul float %temp894.2, undef - br label %ENDIF2564 - -ELSE2632: ; preds = %ELSE2596 - br i1 undef, label %ENDIF2594, label %ELSE2650 - -ELSE2650: ; preds = %ELSE2632 - %227 = fcmp oeq float %temp292.11, %111 - %228 = fcmp olt float %111, %112 - %229 = and i1 %227, %228 - br i1 %229, label %IF2667, label %ELSE2668 - -IF2667: ; preds = %ELSE2650 - br i1 undef, label %ENDIF2594, label %ELSE2671 - -ELSE2668: ; preds = %ELSE2650 - %230 = fcmp oeq float %temp292.11, %129 - %231 = fcmp olt float %129, undef - %232 = and i1 %230, %231 - br i1 %232, label %ENDIF2594, label %ELSE2686 - -ELSE2671: ; preds = %IF2667 - br label %ENDIF2594 - -ELSE2686: ; preds = %ELSE2668 - %233 = fcmp oeq float 
%temp292.11, %146 - %234 = fcmp olt float %146, undef - %235 = and i1 %233, %234 - br i1 %235, label %ENDIF2594, label %ELSE2704 - -ELSE2704: ; preds = %ELSE2686 - %236 = fcmp oeq float %temp292.11, %181 - %237 = fcmp olt float %181, undef - %238 = and i1 %236, %237 - br i1 %238, label %ENDIF2594, label %ELSE2740 - -ELSE2740: ; preds = %ELSE2704 - br i1 undef, label %IF2757, label %ELSE2758 - -IF2757: ; preds = %ELSE2740 - br i1 undef, label %ENDIF2594, label %ELSE2761 - -ELSE2758: ; preds = %ELSE2740 - br i1 undef, label %IF2775, label %ENDIF2594 - -ELSE2761: ; preds = %IF2757 - br label %ENDIF2594 - -IF2775: ; preds = %ELSE2758 - %239 = fcmp olt float undef, undef - br i1 %239, label %ENDIF2594, label %ELSE2779 - -ELSE2779: ; preds = %IF2775 - br i1 undef, label %ENDIF2594, label %ELSE2782 - -ELSE2782: ; preds = %ELSE2779 - br i1 undef, label %ENDIF2594, label %ELSE2785 - -ELSE2785: ; preds = %ELSE2782 - %240 = fcmp olt float undef, 0.000000e+00 - br i1 %240, label %ENDIF2594, label %ELSE2788 - -ELSE2788: ; preds = %ELSE2785 - %241 = fcmp olt float 0.000000e+00, undef - %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00 - br label %ENDIF2594 - -ELSE2797: ; preds = %ENDIF2564 - %242 = fsub float %8, undef - %243 = fsub float %9, undef - %244 = fsub float %10, undef - %245 = fmul float %242, undef - %246 = fmul float %243, undef - %247 = fmul float %244, undef - %248 = fsub float %11, undef - %249 = fsub float %12, undef - %250 = fsub float %13, undef - %251 = fmul float %248, undef - %252 = fmul float %249, undef - %253 = fmul float %250, undef - %254 = call float @llvm.minnum.f32(float %245, float %251) - %255 = call float @llvm.minnum.f32(float %246, float %252) - %256 = call float @llvm.maxnum.f32(float %247, float %253) - %257 = call float @llvm.maxnum.f32(float %254, float %255) - %258 = call float @llvm.maxnum.f32(float %257, float undef) - %259 = call float @llvm.minnum.f32(float undef, float %256) - %260 = fcmp ogt float %258, 0.000000e+00 - %261 = fcmp olt float %258, 1.000000e+00 - %262 = and i1 %260, %261 - %263 = fcmp olt float %258, %259 - %264 = and i1 %262, %263 - br i1 %264, label %ENDIF2795, label %ELSE2800 - -ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 - br label %LOOP - -ELSE2800: ; preds = %ELSE2797 - br i1 undef, label %ENDIF2795, label %ELSE2803 - -ELSE2803: ; preds = %ELSE2800 - %265 = fsub float %20, undef - %266 = fsub float %21, undef - %267 = fsub float %22, undef - %268 = fmul float %265, undef - %269 = fmul float %266, undef - %270 = fmul float %267, 0.000000e+00 - %271 = fsub float %23, undef - %272 = fsub float %24, undef - %273 = fsub float %25, undef - %274 = fmul float %271, undef - %275 = fmul float %272, undef - %276 = fmul float %273, undef - %277 = call float @llvm.minnum.f32(float %268, float %274) - %278 = call float @llvm.maxnum.f32(float %269, float %275) - %279 = call float @llvm.maxnum.f32(float %270, float %276) - %280 = call float @llvm.maxnum.f32(float %277, float undef) - %281 = call float @llvm.maxnum.f32(float %280, float undef) - %282 = call float @llvm.minnum.f32(float undef, float %278) - %283 = call float @llvm.minnum.f32(float %282, float %279) - %284 = fcmp ogt float %281, 0.000000e+00 - %285 = fcmp olt float %281, 1.000000e+00 - %286 = and i1 %284, %285 - %287 = fcmp olt float %281, %283 - %288 = and i1 %286, %287 - br i1 %288, label %ENDIF2795, label %ELSE2806 - -ELSE2806: ; preds = %ELSE2803 - %289 = fsub float %26, 
undef - %290 = fsub float %27, undef - %291 = fsub float %28, undef - %292 = fmul float %289, undef - %293 = fmul float %290, 0.000000e+00 - %294 = fmul float %291, undef - %295 = fsub float %29, undef - %296 = fmul float %295, undef - %297 = call float @llvm.minnum.f32(float %292, float %296) - %298 = call float @llvm.minnum.f32(float %293, float undef) - %299 = call float @llvm.maxnum.f32(float %294, float undef) - %300 = call float @llvm.maxnum.f32(float %297, float %298) - %301 = call float @llvm.maxnum.f32(float %300, float undef) - %302 = call float @llvm.minnum.f32(float undef, float %299) - %303 = fcmp ogt float %301, 0.000000e+00 - %304 = fcmp olt float %301, 1.000000e+00 - %305 = and i1 %303, %304 - %306 = fcmp olt float %301, %302 - %307 = and i1 %305, %306 - br i1 %307, label %ENDIF2795, label %ELSE2809 - -ELSE2809: ; preds = %ELSE2806 - br i1 undef, label %ENDIF2795, label %ELSE2812 - -ELSE2812: ; preds = %ELSE2809 - br i1 undef, label %ENDIF2795, label %ELSE2815 - -ELSE2815: ; preds = %ELSE2812 - br i1 undef, label %ENDIF2795, label %ELSE2818 - -ELSE2818: ; preds = %ELSE2815 - br i1 undef, label %ENDIF2795, label %ELSE2821 - -ELSE2821: ; preds = %ELSE2818 - %308 = fsub float %56, undef - %309 = fsub float %57, undef - %310 = fsub float %58, undef - %311 = fmul float %308, undef - %312 = fmul float %309, 0.000000e+00 - %313 = fmul float %310, undef - %314 = fsub float %59, undef - %315 = fsub float %60, undef - %316 = fsub float %61, undef - %317 = fmul float %314, undef - %318 = fmul float %315, undef - %319 = fmul float %316, undef - %320 = call float @llvm.maxnum.f32(float %311, float %317) - %321 = call float @llvm.maxnum.f32(float %312, float %318) - %322 = call float @llvm.maxnum.f32(float %313, float %319) - %323 = call float @llvm.minnum.f32(float %320, float %321) - %324 = call float @llvm.minnum.f32(float %323, float %322) - %325 = fcmp ogt float undef, 0.000000e+00 - %326 = fcmp olt float undef, 1.000000e+00 - %327 = and i1 %325, %326 - %328 = fcmp olt float undef, %324 - %329 = and i1 %327, %328 - br i1 %329, label %ENDIF2795, label %ELSE2824 - -ELSE2824: ; preds = %ELSE2821 - %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 - br label %ENDIF2795 -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.fraction.(float) #2 - -; Function Attrs: nounwind readnone -declare float @llvm.sqrt.f32(float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.minnum.f32(float, float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.maxnum.f32(float, float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll deleted file mode 100644 index 5a6129aaa3f..00000000000 --- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll +++ /dev/null @@ -1,236 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s - -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.AMDGPU.barrier.local() #2 - - -@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 -@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8 -@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 - -; FUNC-LABEL: @reorder_local_load_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; CI: buffer_store_dword -define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - - %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: buffer_store_dword -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - - %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - store volatile i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; CI: buffer_store_dword -define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - - %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - store i32 99, i32 addrspace(1)* %gptr, align 4 - call void @llvm.AMDGPU.barrier.local() #2 - %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; Technically we could reorder these, but just comparing the -; instruction type of the load is insufficient. 
- -; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load -; CI: buffer_load_dword -; CI: buffer_store_dword -; CI: buffer_load_dword -; CI: buffer_store_dword -define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_constant_load_local_store_constant_load -; CI: buffer_load_dword -; CI: buffer_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword -define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 - store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load -; CI: s_load_dword -; CI: s_load_dword -; CI: s_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword -define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 - store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_global_load_local_store_global_load -; CI: buffer_load_dword -; CI: buffer_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword -define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4 - store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_local_offsets -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 -; CI: buffer_store_dword -; CI: s_endpgm -define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 - 
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 - %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101 - - store i32 123, i32 addrspace(3)* %ptr1, align 4 - %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4 - %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4 - store i32 123, i32 addrspace(3)* %ptr2, align 4 - %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4 - store i32 789, i32 addrspace(3)* %ptr3, align 4 - - %add.0 = add nsw i32 %tmp2, %tmp1 - %add.1 = add nsw i32 %add.0, %tmp3 - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_global_offsets -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 -; CI: buffer_store_dword -; CI: s_endpgm -define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 - %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 - %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101 - - store i32 123, i32 addrspace(1)* %ptr1, align 4 - %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4 - %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4 - store i32 123, i32 addrspace(1)* %ptr2, align 4 - %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4 - store i32 789, i32 addrspace(1)* %ptr3, align 4 - - %add.0 = add nsw i32 %tmp2, %tmp1 - %add.1 = add nsw i32 %add.0, %tmp3 - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load -; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 -; XCI: TBUFFER_STORE_FORMAT -; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 -; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 { -; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - -; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 -; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - -; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - -; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) - -; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - -; %add = add nsw i32 %tmp1, %tmp2 - -; store i32 %add, i32 addrspace(1)* %out, align 4 -; ret void -; } - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind noduplicate } diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll deleted file mode 100644 
index bd427dd3ed4..00000000000 --- a/test/CodeGen/R600/si-vector-hang.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}test_8_min_char: -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; ModuleID = 'radeon' - -define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { -entry: - %0 = load i8, i8 addrspace(1)* %in0, align 1 - %1 = insertelement <8 x i8> undef, i8 %0, i32 0 - %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1 - %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1 - %3 = insertelement <8 x i8> %1, i8 %2, i32 1 - %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2 - %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1 - %5 = insertelement <8 x i8> %3, i8 %4, i32 2 - %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3 - %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1 - %7 = insertelement <8 x i8> %5, i8 %6, i32 3 - %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4 - %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1 - %9 = insertelement <8 x i8> undef, i8 %8, i32 0 - %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5 - %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1 - %11 = insertelement <8 x i8> %9, i8 %10, i32 1 - %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6 - %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1 - %13 = insertelement <8 x i8> %11, i8 %12, i32 2 - %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7 - %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1 - %15 = insertelement <8 x i8> %13, i8 %14, i32 3 - %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> - %16 = load i8, i8 addrspace(1)* %in1, align 1 - %17 = insertelement <8 x i8> undef, i8 %16, i32 0 - %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1 - %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1 - %19 = insertelement <8 x i8> %17, i8 %18, i32 1 - %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2 - %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1 - %21 = insertelement <8 x i8> %19, i8 %20, i32 2 - %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3 - %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1 - %23 = insertelement <8 x i8> %21, i8 %22, i32 3 - %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4 - %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1 - %25 = insertelement <8 x i8> undef, i8 %24, i32 0 - %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5 - %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1 - %27 = insertelement <8 x i8> %25, i8 %26, i32 1 - %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6 - %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1 - %29 = insertelement <8 x i8> %27, i8 %28, i32 2 - %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7 - %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1 - %31 = insertelement <8 x i8> %29, i8 
%30, i32 3 - %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> - %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11 - %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11 - %32 = extractelement <8 x i8> %cond.i, i32 0 - store i8 %32, i8 addrspace(1)* %out, align 1 - %33 = extractelement <8 x i8> %cond.i, i32 1 - %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1 - %34 = extractelement <8 x i8> %cond.i, i32 2 - %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2 - store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1 - %35 = extractelement <8 x i8> %cond.i, i32 3 - %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3 - store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1 - %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4 - %36 = extractelement <8 x i8> %cond.i, i32 4 - store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1 - %37 = extractelement <8 x i8> %cond.i, i32 5 - %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5 - store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1 - %38 = extractelement <8 x i8> %cond.i, i32 6 - %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6 - store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1 - %39 = extractelement <8 x i8> %cond.i, i32 7 - %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7 - store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1 - ret void -} - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} - -!0 = !{null} -!1 = !{null} -!2 = !{null} -!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char} -!4 = !{null} -!5 = !{null} -!6 = !{null} -!7 = !{null} -!8 = !{null} diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll deleted file mode 100644 index 06bee114c23..00000000000 --- a/test/CodeGen/R600/sign_extend.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_sext_i1_to_i32: -; SI: v_cndmask_b32_e64 -; SI: s_endpgm -define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_s_sext_i32_to_i64: -; SI: s_ashr_i32 -; SI: s_endpg -define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { -entry: - %mul = mul i32 %a, %b - %add = add i32 %mul, %c - %sext = sext i32 %add to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i1_to_i64: -; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc -; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] -; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} -; SI: s_endpgm -define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - 
-; SI-LABEL: {{^}}s_sext_i32_to_i64: -; SI: s_ashr_i32 -; SI: s_endpgm -define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { - %sext = sext i32 %a to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}v_sext_i32_to_i64: -; SI: v_ashr -; SI: s_endpgm -define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %sext = sext i32 %val to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i16_to_i64: -; SI: s_endpgm -define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { - %sext = sext i16 %a to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll deleted file mode 100644 index dffee70b6b0..00000000000 --- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll +++ /dev/null @@ -1,39 +0,0 @@ -; XFAIL: * -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s - -; 64-bit select was originally lowered with a build_pair, and this -; could be simplified to 1 cndmask instead of 2, but that broken when -; it started being implemented with a v2i32 build_vector and -; bitcasting. -define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %select to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Fix truncating store for local memory -; SI-LABEL: {{^}}trunc_load_alloca_i64: -; SI: v_movrels_b32 -; SI-NOT: v_movrels_b32 -; SI: s_endpgm -define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { - %idx = add i32 %a, %b - %alloca = alloca i64, i32 4 - %gep0 = getelementptr i64, i64* %alloca, i64 0 - %gep1 = getelementptr i64, i64* %alloca, i64 1 - %gep2 = getelementptr i64, i64* %alloca, i64 2 - %gep3 = getelementptr i64, i64* %alloca, i64 3 - store i64 24, i64* %gep0, align 8 - store i64 9334, i64* %gep1, align 8 - store i64 3935, i64* %gep2, align 8 - store i64 9342, i64* %gep3, align 8 - %gep = getelementptr i64, i64* %alloca, i32 %idx - %load = load i64, i64* %gep, align 8 - %mask = and i64 %load, 4294967296 - %add = add i64 %mask, -1 - store i64 %add, i64 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll deleted file mode 100644 index da4e91db3a3..00000000000 --- a/test/CodeGen/R600/sint_to_fp.f64.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 -; SI: v_cvt_f64_i32_e32 -define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { - %result = sitofp i32 %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; FIXME: select on 0, 0 -; SI-LABEL: {{^}}sint_to_fp_i1_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] -; SI: 
buffer_store_dwordx2 -; SI: s_endpgm -define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = sitofp i1 %cmp to double - store double %fp, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}sint_to_fp_i1_f64_load: -; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1 -; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { - %fp = sitofp i1 %in to double - store double %fp, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: @s_sint_to_fp_i64_to_f64 -define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { - %result = sitofp i64 %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; SI-LABEL: @v_sint_to_fp_i64_to_f64 -; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %val = load i64, i64 addrspace(1)* %gep, align 8 - %result = sitofp i64 %val to double - store double %result, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll deleted file mode 100644 index 8506441d136..00000000000 --- a/test/CodeGen/R600/sint_to_fp.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32: -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} -define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { - %result = sitofp i32 %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_v2i32: -; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X - -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { - %result = sitofp <2 x i32> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_v4i32: -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %value = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = sitofp <4 x i32> %value to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; 
FUNC-LABEL: {{^}}sint_to_fp_i1_f32: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load: -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) { - %fp = sitofp i1 %in to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll deleted file mode 100644 index b0c18ca5959..00000000000 --- a/test/CodeGen/R600/smrd.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s - -; SMRD load with an immediate offset. -; GCN-LABEL: {{^}}smrd0: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with the largest possible immediate offset. -; GCN-LABEL: {{^}}smrd1: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with an offset greater than the largest possible immediate. 
-; GCN-LABEL: {{^}}smrd2: -; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with a 64-bit offset -; GCN-LABEL: {{^}}smrd3: -; FIXME: There are too many copies here because we don't fold immediates -; through REG_SEQUENCE -; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; -; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 -; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; FIXME: We should be able to use s_load_dword here -; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 -; TODO: Add VI checks -; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load using the load.const intrinsic with an immediate offset -; GCN-LABEL: {{^}}smrd_load_const0: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} - -; SMRD load using the load.const intrinsic with the largest possible immediate -; offset. -; GCN-LABEL: {{^}}smrd_load_const1: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} -; SMRD load using the load.const intrinsic with an offset greater than the -; largets possible immediate. -; immediate offset. 
-; GCN-LABEL: {{^}}smrd_load_const2: -; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/split-scalar-i64-add.ll b/test/CodeGen/R600/split-scalar-i64-add.ll deleted file mode 100644 index 46409cdfae1..00000000000 --- a/test/CodeGen/R600/split-scalar-i64-add.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() readnone - -; This is broken because the low half of the 64-bit add remains on the -; SALU, but the upper half does not. The addc expects the carry bit -; set in vcc, which is undefined since the low scalar half add sets -; scc instead. - -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { - %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, 399 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { - %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, %val1 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} - -; Doesn't use constants -; FUNC-LABEL @imp_def_vcc_split_i64_add_2 -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %load = load i32, i32 addrspace(1)* %gep - %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, %val1 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll deleted file mode 100644 index bcbc32f4c05..00000000000 --- a/test/CodeGen/R600/sra.ll +++ /dev/null @@ -1,213 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc 
< %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s - -;EG-LABEL: {{^}}ashr_v2i32: -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_v2i32: -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v2i32: -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = ashr <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v4i32: -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_v4i32: -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v4i32: -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = ashr <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_i64: -;EG: ASHR - -;SI-LABEL: {{^}}ashr_i64: -;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 - -;VI-LABEL: {{^}}ashr_i64: -;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 - -define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) { -entry: - %0 = sext i32 %in to i64 - %1 = ashr i64 %0, 8 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_i64_2: -;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal -;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal -;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -;EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_i64_2: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_i64_2: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { -entry: - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1) * %in - %b = load i64, i64 addrspace(1) * %b_ptr - %result = ashr i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v2i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: ASHR {{.*}}, [[SHA]] -;EG-DAG: ASHR {{.*}}, [[SHB]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI-LABEL: {{^}}ashr_v2i64: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v2i64: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1) * %in - %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr - %result = ashr <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v4i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHC]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHD]] -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: ASHR {{.*}}, [[SHA]] -;EG-DAG: ASHR {{.*}}, [[SHB]] -;EG-DAG: ASHR {{.*}}, [[SHC]] -;EG-DAG: ASHR {{.*}}, [[SHD]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI-LABEL: {{^}}ashr_v4i64: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v4i64: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1) * %in - %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr - %result = ashr <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/srem.ll b/test/CodeGen/R600/srem.ll deleted file mode 100644 index c78fd549b31..00000000000 --- a/test/CodeGen/R600/srem.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s - -define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in - %den = load i32, i32 addrspace(1) * %den_ptr - %result = srem i32 %num, %den - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = srem i32 %num, 4 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srem_i32_7: -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 -; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], -; SI: v_mul_lo_i32 -; SI: v_sub_i32 -; SI: s_endpgm -define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = srem i32 %num, 7 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr - %result = srem <2 x i32> %num, %den - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %result = srem <2 x i32> %num, - store <2 x i32> %result, <2 x i32> 
addrspace(1)* %out - ret void -} - -define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr - %result = srem <4 x i32> %num, %den - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = srem <4 x i32> %num, - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %num = load i64, i64 addrspace(1) * %in - %den = load i64, i64 addrspace(1) * %den_ptr - %result = srem i64 %num, %den - store i64 %result, i64 addrspace(1)* %out - ret void -} - -define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %num = load i64, i64 addrspace(1) * %in - %result = srem i64 %num, 4 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %num = load <2 x i64>, <2 x i64> addrspace(1) * %in - %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr - %result = srem <2 x i64> %num, %den - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %num = load <2 x i64>, <2 x i64> addrspace(1) * %in - %result = srem <2 x i64> %num, - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %num = load <4 x i64>, <4 x i64> addrspace(1) * %in - %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr - %result = srem <4 x i64> %num, %den - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %num = load <4 x i64>, <4 x i64> addrspace(1) * %in - %result = srem <4 x i64> %num, - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll deleted file mode 100644 index 4904d7fa1bd..00000000000 --- a/test/CodeGen/R600/srl.ll +++ /dev/null @@ -1,186 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lshr_i32: -; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = lshr i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v2i32: -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = lshr <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v4i32: -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = lshr <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 -define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1)* %in - %b = load i64, i64 addrspace(1)* %b_ptr - %result = lshr i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v2i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1)* %in - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = lshr <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v4i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHC]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHD]] -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHC]] -; EG-DAG: LSHR {{.*}}, [[SHD]] -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHC]] -; EG-DAG: LSHR {{.*}}, [[SHD]] -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1)* %in - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = lshr <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll deleted file mode 100644 index 26884a1b776..00000000000 --- a/test/CodeGen/R600/ssubo.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s - -declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}ssubo_i64_zext: -define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_ssubo_i32: -define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %ssub, 0 - %carry = extractvalue { i32, i1 } %ssub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_ssubo_i32: -define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %ssub, 0 - %carry = extractvalue { i32, i1 } %ssub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_ssubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 -define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - store 
i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_ssubo_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/store-barrier.ll b/test/CodeGen/R600/store-barrier.ll deleted file mode 100644 index 4a72b4d090a..00000000000 --- a/test/CodeGen/R600/store-barrier.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s - -; This test is for a bug in the machine scheduler where stores without -; an underlying object would be moved across the barrier. In this -; test, the <2 x i8> store will be split into two i8 stores, so they -; won't have an underlying object. - -; CHECK-LABEL: {{^}}test: -; CHECK: ds_write_b8 -; CHECK: ds_write_b8 -; CHECK: s_barrier -; CHECK: s_endpgm -; Function Attrs: nounwind -define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) { -bb: - %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 - %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 - %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 - %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 - %tmp16 = add i32 %tmp13, 1 - %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 - store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 - tail call void @llvm.AMDGPU.barrier.local() #2 - %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 - %tmp26 = sext i32 %tmp25 to i64 - %tmp27 = sext i32 %arg4 to i64 - %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4 - %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1 - %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27 - store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1 - %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0 - %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1 - %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0 - store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1 - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/store-v3i32.ll b/test/CodeGen/R600/store-v3i32.ll deleted file mode 100644 index 33617b55ed6..00000000000 --- a/test/CodeGen/R600/store-v3i32.ll +++ /dev/null @@ -1,13 +0,0 @@ -; XFAIL: * -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck 
-check-prefix=SI %s - -; 3 vectors have the same size and alignment as 4 vectors, so this -; should be done in a single store. - -; SI-LABEL: {{^}}store_v3i32: -; SI: buffer_store_dwordx4 -define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { - store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll deleted file mode 100644 index e0c554ad2c1..00000000000 --- a/test/CodeGen/R600/store-v3i64.ll +++ /dev/null @@ -1,29 +0,0 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}global_store_v3i64: -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}global_store_v3i64_unaligned: -define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}local_store_v3i64: -define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}local_store_v3i64_unaligned: -define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 - ret void -} diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll deleted file mode 100644 index d5af3b29118..00000000000 --- a/test/CodeGen/R600/store-vector-ptrs.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s - -; This tests for a bug that caused a crash in -; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting -; scratch loads and stores. 
-; CHECK-LABEL: {{^}}store_vector_ptrs: -define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { - %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> - store <4 x i32*> %p, <4 x i32*>* %out - ret void -} diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll deleted file mode 100644 index 0f89405e073..00000000000 --- a/test/CodeGen/R600/store.ll +++ /dev/null @@ -1,369 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s - -;===------------------------------------------------------------------------===; -; Global Address Space -;===------------------------------------------------------------------------===; -; FUNC-LABEL: {{^}}store_i1: -; EG: MEM_RAT MSKOR -; SI: buffer_store_byte -define void @store_i1(i1 addrspace(1)* %out) { -entry: - store i1 true, i1 addrspace(1)* %out - ret void -} - -; i8 store -; FUNC-LABEL: {{^}}store_i8: -; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X - -; IG 0: Get the byte index and truncate the value -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y -; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) - - -; IG 1: Truncate the calculated the shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 255 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the prevous IGs. -; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 - -; SI: buffer_store_byte - -define void @store_i8(i8 addrspace(1)* %out, i8 %in) { -entry: - store i8 %in, i8 addrspace(1)* %out - ret void -} - -; i16 store -; FUNC-LABEL: {{^}}store_i16: -; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X - -; IG 0: Get the byte index and truncate the value - - -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG-NEXT: 3(4.203895e-45), - -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y - -; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) -; IG 1: Truncate the calculated the shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 65535 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the prevous IGs. 
-; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 - -; SI: buffer_store_short -define void @store_i16(i16 addrspace(1)* %out, i16 %in) { -entry: - store i16 %in, i16 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v2i8: -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR - -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { -entry: - %0 = trunc <2 x i32> %in to <2 x i8> - store <2 x i8> %0, <2 x i8> addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}store_v2i16: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_short -; SI: buffer_store_short -define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { -entry: - %0 = trunc <2 x i32> %in to <2 x i16> - store <2 x i16> %0, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i8: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { -entry: - %0 = trunc <4 x i32> %in to <4 x i8> - store <4 x i8> %0, <4 x i8> addrspace(1)* %out - ret void -} - -; floating-point store -; FUNC-LABEL: {{^}}store_f32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 - -; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}} - -; SI: buffer_store_dword - -define void @store_f32(float addrspace(1)* %out, float %in) { - store float %in, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i16: -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR - -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI-NOT: buffer_store_byte -define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { -entry: - %0 = trunc <4 x i32> %in to <4 x i16> - store <4 x i16> %0, <4 x i16> addrspace(1)* %out - ret void -} - -; vec2 floating-point stores -; FUNC-LABEL: {{^}}store_v2f32: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx2 - -define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { -entry: - %0 = insertelement <2 x float> , float %a, i32 0 - %1 = insertelement <2 x float> %0, float %b, i32 1 - store <2 x float> %1, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i32: -; EG: MEM_RAT_CACHELESS STORE_RAW -; EG-NOT: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD -; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx4 -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { -entry: - store <4 x i32> %in, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_i64_i8: -; EG: MEM_RAT MSKOR -; SI: buffer_store_byte -define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_i64_i16: -; EG: MEM_RAT MSKOR -; SI: buffer_store_short -define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; Local Address Space -;===------------------------------------------------------------------------===; - -; FUNC-LABEL: 
{{^}}store_local_i1: -; EG: LDS_BYTE_WRITE -; SI: ds_write_b8 -define void @store_local_i1(i1 addrspace(3)* %out) { -entry: - store i1 true, i1 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i8: -; EG: LDS_BYTE_WRITE - -; SI: ds_write_b8 -define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { - store i8 %in, i8 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i16: -; EG: LDS_SHORT_WRITE - -; SI: ds_write_b16 -define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { - store i16 %in, i16 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v2i16: -; EG: LDS_WRITE - -; CM: LDS_WRITE - -; SI: ds_write_b16 -; SI: ds_write_b16 -define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { -entry: - store <2 x i16> %in, <2 x i16> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v4i8: -; EG: LDS_WRITE - -; CM: LDS_WRITE - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { -entry: - store <4 x i8> %in, <4 x i8> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v2i32: -; EG: LDS_WRITE -; EG: LDS_WRITE - -; CM: LDS_WRITE -; CM: LDS_WRITE - -; SI: ds_write_b64 -define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { -entry: - store <2 x i32> %in, <2 x i32> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v4i32: -; EG: LDS_WRITE -; EG: LDS_WRITE -; EG: LDS_WRITE -; EG: LDS_WRITE - -; CM: LDS_WRITE -; CM: LDS_WRITE -; CM: LDS_WRITE -; CM: LDS_WRITE - -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { -entry: - store <4 x i32> %in, <4 x i32> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i64_i8: -; EG: LDS_BYTE_WRITE -; SI: ds_write_b8 -define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i64_i16: -; EG: LDS_SHORT_WRITE -; SI: ds_write_b16 -define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(3)* %out - ret void -} - -; The stores in this function are combined by the optimizer to create a -; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer -; should not try to split the 64-bit store back into 2 32-bit stores. -; -; Evergreen / Northern Islands don't support 64-bit stores yet, so there should -; be two 32-bit stores. 
- -; FUNC-LABEL: {{^}}vecload2: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx2 -define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { -entry: - %0 = load i32, i32 addrspace(2)* %mem, align 4 - %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 - %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 - store i32 %0, i32 addrspace(1)* %out, align 4 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 - ret void -} - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -; When i128 was a legal type this program generated cannot select errors: - -; FUNC-LABEL: {{^}}"i128-const-store": -; FIXME: We should be able to to this with one store instruction -; EG: STORE_RAW -; EG: STORE_RAW -; EG: STORE_RAW -; EG: STORE_RAW -; CM: STORE_DWORD -; CM: STORE_DWORD -; CM: STORE_DWORD -; CM: STORE_DWORD -; SI: buffer_store_dwordx4 -define void @i128-const-store(i32 addrspace(1)* %out) { -entry: - store i32 1, i32 addrspace(1)* %out, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 1, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 - store i32 2, i32 addrspace(1)* %arrayidx4, align 4 - %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 - store i32 2, i32 addrspace(1)* %arrayidx6, align 4 - ret void -} diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll deleted file mode 100644 index 696fb033b5e..00000000000 --- a/test/CodeGen/R600/store.r600.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s - -; XXX: Merge this test into store.ll once it is supported on SI - -; v4i32 store -; EG: {{^}}store_v4i32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 - -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; v4f32 store -; EG: {{^}}store_v4f32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 -define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %1 = load <4 x float>, <4 x float> addrspace(1) * %in - store <4 x float> %1, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/structurize.ll b/test/CodeGen/R600/structurize.ll deleted file mode 100644 index 02e592e9a55..00000000000 --- a/test/CodeGen/R600/structurize.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s -; Test case for a crash in the AMDILCFGStructurizer from a CFG like this: -; -; entry -; / \ -; diamond_head branch_from -; / \ | -; diamond_false diamond_true -; \ / -; done -; -; When the diamond_true branch had more than 100 instructions. -; -; - -; CHECK-LABEL: {{^}}branch_into_diamond: -; === entry block: -; CHECK: ALU_PUSH_BEFORE -; === Branch instruction (IF): -; CHECK: JUMP - ; === branch_from block - ; CHECK: ALU - ; === Duplicated diamond_true block (There can be more than one ALU clause): - ; === XXX: We should be able to optimize this so the basic block is not - ; === duplicated. 
See comments in - ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf() - ; CHECK: ALU -; === Branch instruction (ELSE): -; CHECK: ELSE - ; === diamond_head block: - ; CHECK: ALU_PUSH_BEFORE - ; === Branch instruction (IF): - ; CHECK: JUMP - ; === diamond_true block (There can be more than one ALU clause): - ; ALU - ; === Branch instruction (ELSE): - ; CHECK: ELSE - ; === diamond_false block plus implicit ENDIF - ; CHECK: ALU_POP_AFTER -; === Branch instruction (ENDIF): -; CHECK: POP -; === done block: -; CHECK: ALU -; CHECK: MEM_RAT_CACHELESS -; CHECK: CF_END - - -define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: -%0 = icmp ne i32 %a, 0 - br i1 %0, label %diamond_head, label %branch_from - -diamond_head: - %1 = icmp ne i32 %a, 1 - br i1 %1, label %diamond_true, label %diamond_false - -branch_from: - %2 = add i32 %a, 1 - br label %diamond_true - -diamond_false: - %3 = add i32 %a, 2 - br label %done - -diamond_true: - %4 = phi i32 [%2, %branch_from], [%a, %diamond_head] - ; This block needs to be > 100 ISA instructions to hit the bug, - ; so we'll use udiv instructions. - %div0 = udiv i32 %a, %b - %div1 = udiv i32 %div0, %4 - %div2 = udiv i32 %div1, 11 - %div3 = udiv i32 %div2, %a - %div4 = udiv i32 %div3, %b - %div5 = udiv i32 %div4, %c - %div6 = udiv i32 %div5, %div0 - %div7 = udiv i32 %div6, %div1 - br label %done - -done: - %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true] - store i32 %5, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/structurize1.ll b/test/CodeGen/R600/structurize1.ll deleted file mode 100644 index 77432c1f9d2..00000000000 --- a/test/CodeGen/R600/structurize1.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s - -; This tests for abug where the AMDILCFGStructurizer was crashing on loops -; like this: -; -; for (i = 0; i < x; i++) { -; if (cond0) { -; if (cond1) { -; -; } else { -; -; } -; if (cond2) { -; -; } -; } -; } - -; CHECK-LABEL: {{^}}if_inside_loop: -; CHECK: LOOP_START_DX10 -; CHECK: END_LOOP -define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { -entry: - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%inc, %for.inc] - %val = phi i32 [0, %entry], [%val.for.inc, %for.inc] - %inc = add i32 %0, 1 - %1 = icmp ult i32 10, %a - br i1 %1, label %for.inc, label %if.then - -if.then: - %2 = icmp ne i32 0, %b - br i1 %2, label %if.then.true, label %if.then.false - -if.then.true: - %3 = add i32 %a, %val - br label %if - -if.then.false: - %4 = mul i32 %a, %val - br label %if - -if: - %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false] - %5 = icmp ne i32 0, %c - br i1 %5, label %if.true, label %for.inc - -if.true: - %6 = add i32 %a, %val.if - br label %for.inc - -for.inc: - %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true] - %7 = icmp ne i32 0, %d - br i1 %7, label %for.body, label %exit - -exit: - store i32 %val.for.inc, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll deleted file mode 100644 index b7fba0efa5b..00000000000 --- a/test/CodeGen/R600/sub.ll +++ /dev/null @@ -1,130 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare i32 @llvm.r600.read.tidig.x() readnone - -; FUNC-LABEL: {{^}}test_sub_i32: -; EG: SUB_INT {{\** 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = sub i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}test_sub_v2i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = sub <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_sub_v4i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = sub <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_sub_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { - %result = sub i64 %a, %b - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sub_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid - %a = load i64, i64 addrspace(1)* %a_ptr - %b = load i64, i64 addrspace(1)* %b_ptr - %result = sub i64 %a, %b - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_test_sub_v2i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> 
addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid - %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = sub <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid - %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = sub <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/subreg-coalescer-crash.ll b/test/CodeGen/R600/subreg-coalescer-crash.ll deleted file mode 100644 index c4dae4736cf..00000000000 --- a/test/CodeGen/R600/subreg-coalescer-crash.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s - -; SI-LABEL:{{^}}row_filter_C1_D0: -; SI: s_endpgm -; Function Attrs: nounwind -define void @row_filter_C1_D0() { -entry: - br i1 undef, label %for.inc.1, label %do.body.preheader - -do.body.preheader: ; preds = %entry - %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 - br i1 undef, label %do.body56.1, label %do.body90 - -do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader - %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] - %2 = insertelement <4 x i32> %1, i32 undef, i32 2 - %3 = insertelement <4 x i32> %2, i32 undef, i32 3 - br i1 undef, label %do.body124.1, label %do.body.1562.preheader - -do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 - %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] - %4 = insertelement <4 x i32> undef, i32 undef, i32 1 - br label %for.inc.1 - -do.body56.1: ; preds = %do.body.preheader - %5 = insertelement <4 x i32> %0, i32 undef, i32 1 - %or.cond472.1 = or i1 undef, undef - br i1 %or.cond472.1, label %do.body56.2, label %do.body90 - -do.body56.2: ; preds = %do.body56.1 - %6 = insertelement <4 x i32> %5, i32 undef, i32 1 - br label %do.body90 - -do.body124.1: ; preds = %do.body90 - %7 = insertelement <4 x i32> %3, i32 undef, i32 3 - br label %do.body.1562.preheader - -for.inc.1: ; preds = %do.body.1562.preheader, %entry - %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ] - %add.i495 = add <4 x i32> %storemerge591, undef - unreachable -} - -; SI-LABEL: {{^}}foo: -; SI: s_endpgm -define void @foo() #0 { -bb: - br i1 undef, label %bb2, label %bb1 - -bb1: ; preds = %bb - br i1 undef, label %bb4, label %bb6 - -bb2: ; preds = %bb4, %bb - %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ] - br i1 undef, label %bb9, label %bb13 - -bb4: ; preds = %bb7, %bb6, %bb1 - %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ] - br label %bb2 - -bb6: ; preds 
= %bb1 - br i1 undef, label %bb7, label %bb4 - -bb7: ; preds = %bb6 - %tmp8 = fmul float undef, undef - br label %bb4 - -bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) - %tmp11 = extractelement <4 x float> %tmp10, i32 1 - %tmp12 = extractelement <4 x float> %tmp10, i32 3 - br label %bb14 - -bb13: ; preds = %bb2 - br i1 undef, label %bb23, label %bb24 - -bb14: ; preds = %bb27, %bb24, %bb9 - %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] - %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] - %tmp17 = fmul float 10.5, %tmp16 - %tmp18 = fmul float 11.5, %tmp15 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) - ret void - -bb23: ; preds = %bb13 - br i1 undef, label %bb24, label %bb26 - -bb24: ; preds = %bb26, %bb23, %bb13 - %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ] - br i1 undef, label %bb27, label %bb14 - -bb26: ; preds = %bb23 - br label %bb24 - -bb27: ; preds = %bb24 - br label %bb14 -} - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/subreg-eliminate-dead.ll b/test/CodeGen/R600/subreg-eliminate-dead.ll deleted file mode 100644 index 8bd995a8ecb..00000000000 --- a/test/CodeGen/R600/subreg-eliminate-dead.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s -; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges -; properly. - -; Just make sure this test doesn't crash. 
-; CHECK-LABEL: foobar: -; CHECK: s_endpgm -define void @foobar() { - %v0 = icmp eq <4 x i32> undef, - %v3 = sext <4 x i1> %v0 to <4 x i32> - %v4 = extractelement <4 x i32> %v3, i32 1 - %v5 = icmp ne i32 %v4, 0 - %v6 = select i1 %v5, i32 undef, i32 0 - %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 - store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 - ret void -} - -declare double @llvm.fma.f64(double, double, double) diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll deleted file mode 100644 index 000ee2faa47..00000000000 --- a/test/CodeGen/R600/swizzle-export.ll +++ /dev/null @@ -1,129 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s - -;EG: {{^}}main: -;EG: EXPORT T{{[0-9]+}}.XYXX -;EG: EXPORT T{{[0-9]+}}.ZXXX -;EG: EXPORT T{{[0-9]+}}.XXWX -;EG: EXPORT T{{[0-9]+}}.XXXW - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = load <4 x float>, <4 x float> addrspace(8)* null - %5 = extractelement <4 x float> %4, i32 1 - %6 = load <4 x float>, <4 x float> addrspace(8)* null - %7 = extractelement <4 x float> %6, i32 2 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 0 - %10 = fmul float 0.000000e+00, %9 - %11 = load <4 x float>, <4 x float> addrspace(8)* null - %12 = extractelement <4 x float> %11, i32 0 - %13 = fmul float %5, %12 - %14 = load <4 x float>, <4 x float> addrspace(8)* null - %15 = extractelement <4 x float> %14, i32 0 - %16 = fmul float 0.000000e+00, %15 - %17 = load <4 x float>, <4 x float> addrspace(8)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = fmul float 0.000000e+00, %18 - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fmul float %7, %21 - %23 = load <4 x float>, <4 x float> addrspace(8)* null - %24 = extractelement <4 x float> %23, i32 0 - %25 = fmul float 0.000000e+00, %24 - %26 = load <4 x float>, <4 x float> addrspace(8)* null - %27 = extractelement <4 x float> %26, i32 0 - %28 = fmul float 0.000000e+00, %27 - %29 = load <4 x float>, <4 x float> addrspace(8)* null - %30 = extractelement <4 x float> %29, i32 0 - %31 = fmul float 0.000000e+00, %30 - %32 = load <4 x float>, <4 x float> addrspace(8)* null - %33 = extractelement <4 x float> %32, i32 0 - %34 = fmul float 0.000000e+00, %33 - %35 = load <4 x float>, <4 x float> addrspace(8)* null - %36 = extractelement <4 x float> %35, i32 0 - %37 = fmul float 0.000000e+00, %36 - %38 = load <4 x float>, <4 x float> addrspace(8)* null - %39 = extractelement <4 x float> %38, i32 0 - %40 = fmul float 1.000000e+00, %39 - %41 = load <4 x float>, <4 x float> addrspace(8)* null - %42 = extractelement <4 x float> %41, i32 0 - %43 = fmul float 0.000000e+00, %42 - %44 = load <4 x float>, <4 x float> addrspace(8)* null - %45 = extractelement <4 x float> %44, i32 0 - %46 = fmul float 0.000000e+00, %45 - %47 = load <4 x float>, <4 x float> addrspace(8)* null - %48 = extractelement <4 x float> %47, i32 0 - %49 = fmul float 0.000000e+00, %48 - %50 = load <4 x float>, <4 x float> addrspace(8)* null - %51 = extractelement <4 x float> %50, i32 0 - %52 = fmul float 0.000000e+00, %51 - %53 = load <4 x float>, <4 x float> addrspace(8)* null - %54 = extractelement <4 x float> %53, i32 0 - %55 = fmul float 
1.000000e+00, %54 - %56 = insertelement <4 x float> undef, float %0, i32 0 - %57 = insertelement <4 x float> %56, float %1, i32 1 - %58 = insertelement <4 x float> %57, float %2, i32 2 - %59 = insertelement <4 x float> %58, float %3, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1) - %60 = insertelement <4 x float> undef, float %10, i32 0 - %61 = insertelement <4 x float> %60, float %13, i32 1 - %62 = insertelement <4 x float> %61, float %16, i32 2 - %63 = insertelement <4 x float> %62, float %19, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2) - %64 = insertelement <4 x float> undef, float %22, i32 0 - %65 = insertelement <4 x float> %64, float %25, i32 1 - %66 = insertelement <4 x float> %65, float %28, i32 2 - %67 = insertelement <4 x float> %66, float %31, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2) - %68 = insertelement <4 x float> undef, float %34, i32 0 - %69 = insertelement <4 x float> %68, float %37, i32 1 - %70 = insertelement <4 x float> %69, float %40, i32 2 - %71 = insertelement <4 x float> %70, float %43, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2) - %72 = insertelement <4 x float> undef, float %46, i32 0 - %73 = insertelement <4 x float> %72, float %49, i32 1 - %74 = insertelement <4 x float> %73, float %52, i32 2 - %75 = insertelement <4 x float> %74, float %55, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2) - ret void -} - -; EG: {{^}}main2: -; EG: T{{[0-9]+}}.XY__ -; EG: T{{[0-9]+}}.ZXY0 - -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = fadd float %0, 2.5 - %3 = fmul float %1, 3.5 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 0 - %6 = call float @llvm.cos.f32(float %5) - %7 = load <4 x float>, <4 x float> addrspace(8)* null - %8 = extractelement <4 x float> %7, i32 0 - %9 = load <4 x float>, <4 x float> addrspace(8)* null - %10 = extractelement <4 x float> %9, i32 1 - %11 = insertelement <4 x float> undef, float %2, i32 0 - %12 = insertelement <4 x float> %11, float %3, i32 1 - call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1) - %13 = insertelement <4 x float> undef, float %6, i32 0 - %14 = insertelement <4 x float> %13, float %8, i32 1 - %15 = insertelement <4 x float> %14, float %10, i32 2 - %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2) - ret void -} - -; Function Attrs: nounwind readonly -declare float @llvm.cos.f32(float) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/R600/tex-clause-antidep.ll b/test/CodeGen/R600/tex-clause-antidep.ll deleted file mode 100644 index cbb9c50974a..00000000000 --- a/test/CodeGen/R600/tex-clause-antidep.ll +++ /dev/null @@ -1,25 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: TEX -;CHECK-NEXT: ALU - -define void @test(<4 x float> inreg %reg0) #0 { - %1 = extractelement <4 x float> %reg0, i32 0 - %2 = extractelement <4 x float> %reg0, i32 1 - %3 = extractelement <4 x float> %reg0, i32 2 - %4 = extractelement <4 x float> %reg0, i32 3 - %5 = insertelement <4 x float> undef, float 
%1, i32 0 - %6 = insertelement <4 x float> %5, float %2, i32 1 - %7 = insertelement <4 x float> %6, float %3, i32 2 - %8 = insertelement <4 x float> %7, float %4, i32 3 - %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %11 = fadd <4 x float> %9, %10 - call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0) - ret void -} - -declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/R600/texture-input-merge.ll b/test/CodeGen/R600/texture-input-merge.ll deleted file mode 100644 index 789538af582..00000000000 --- a/test/CodeGen/R600/texture-input-merge.ll +++ /dev/null @@ -1,31 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: MOV - -define void @test(<4 x float> inreg %reg0) #0 { - %1 = extractelement <4 x float> %reg0, i32 0 - %2 = extractelement <4 x float> %reg0, i32 1 - %3 = extractelement <4 x float> %reg0, i32 2 - %4 = extractelement <4 x float> %reg0, i32 3 - %5 = fmul float %1, 3.0 - %6 = fmul float %2, 3.0 - %7 = fmul float %3, 3.0 - %8 = fmul float %4, 3.0 - %9 = insertelement <4 x float> undef, float %5, i32 0 - %10 = insertelement <4 x float> %9, float %6, i32 1 - %11 = insertelement <4 x float> undef, float %7, i32 0 - %12 = insertelement <4 x float> %11, float %5, i32 1 - %13 = insertelement <4 x float> undef, float %8, i32 0 - %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %17 = fadd <4 x float> %14, %15 - %18 = fadd <4 x float> %17, %16 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0) - ret void -} - -declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/R600/trunc-cmp-constant.ll b/test/CodeGen/R600/trunc-cmp-constant.ll deleted file mode 100644 index dac74728b3c..00000000000 --- a/test/CodeGen/R600/trunc-cmp-constant.ll +++ /dev/null @@ -1,170 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}} -; SI: v_cndmask_b32_e64 -; SI: buffer_store_byte -define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp eq i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FIXME: The negate should be inverting the compare. 
-; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp eq i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp eq i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp eq i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp eq i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp eq i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - - -; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, 1 
- store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FIXME: This should be one compare. -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1: -; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} -; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] -; XSI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1: -; SI: buffer_load_sbyte [[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}} -; SI-NEXT: v_cndmask_b32_e64 -; SI-NEXT: buffer_store_byte -define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %masked = and i8 %load, 255 - %ext = sext i8 %masked to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/trunc-store-f64-to-f16.ll b/test/CodeGen/R600/trunc-store-f64-to-f16.ll deleted file mode 100644 index c29872beef8..00000000000 --- a/test/CodeGen/R600/trunc-store-f64-to-f16.ll +++ /dev/null @@ -1,56 +0,0 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI < %s - -; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: -; GCN: s_endpgm -define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { - %val = load double, double addrspace(1)* %in - %cvt = fptrunc double %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: -; GCN: s_endpgm -define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { - %val = load <2 x double>, <2 x double> addrspace(1)* %in - %cvt = fptrunc <2 x double> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: -; GCN: s_endpgm -define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { - %val = load <3 x double>, <3 x double> addrspace(1)* %in - %cvt = fptrunc <3 x double> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 
{{^}}global_truncstore_v4f64_to_v4f16: -; GCN: s_endpgm -define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { - %val = load <4 x double>, <4 x double> addrspace(1)* %in - %cvt = fptrunc <4 x double> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: -; GCN: s_endpgm -define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { - %val = load <8 x double>, <8 x double> addrspace(1)* %in - %cvt = fptrunc <8 x double> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: -; GCN: s_endpgm -define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { - %val = load <16 x double>, <16 x double> addrspace(1)* %in - %cvt = fptrunc <16 x double> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll deleted file mode 100644 index b71a838b62c..00000000000 --- a/test/CodeGen/R600/trunc-store-i1.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - - -; SI-LABEL: {{^}}global_truncstore_i32_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { - %trunc = trunc i32 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}global_truncstore_i64_to_i1: -; SI: buffer_store_byte -define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { - %trunc = trunc i64 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { - %trunc = trunc i16 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} diff --git a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll deleted file mode 100644 index 878ea3f4899..00000000000 --- a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This tests for a bug in the SelectionDAG where custom lowered truncated -; vector stores at the end of a basic block were not being added to the -; LegalizedNodes list, which triggered an assertion failure. 
- -; CHECK-LABEL: {{^}}test: -; CHECK: MEM_RAT_CACHELESS STORE_RAW -define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %if, label %done - -if: - store <4 x i8> %in, <4 x i8> addrspace(1)* %out - br label %done - -done: - ret void -} diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll deleted file mode 100644 index bf690ca4cb2..00000000000 --- a/test/CodeGen/R600/trunc.ll +++ /dev/null @@ -1,100 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { -; SI-LABEL: {{^}}trunc_i64_to_i32_store: -; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb -; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] -; SI: buffer_store_dword [[VLOAD]] - -; EG-LABEL: {{^}}trunc_i64_to_i32_store: -; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG: LSHR -; EG-NEXT: 2( - - %result = trunc i64 %in to i32 store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}trunc_load_shl_i64: -; SI-DAG: s_load_dwordx2 -; SI-DAG: s_load_dword [[SREG:s[0-9]+]], -; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2 -; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]] -; SI: buffer_store_dword [[VSHL]], -define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { - %b = shl i64 %a, 2 - %result = trunc i64 %b to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}trunc_shl_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2 -; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], -; SI: s_addc_u32 -; SI: v_mov_b32_e32 -; SI: v_mov_b32_e32 -; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] -; SI: buffer_store_dword v[[LO_VREG]], -define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { - %aa = add i64 %a, 234 ; Prevent shrinking store. 
- %b = shl i64 %aa, 2 - %result = trunc i64 %b to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits - ret void -} - -; SI-LABEL: {{^}}trunc_i32_to_i1: -; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: v_cmp_eq_i32 -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { - %a = load i32, i32 addrspace(1)* %ptr, align 4 - %trunc = trunc i32 %a to i1 - %result = select i1 %trunc, i32 1, i32 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} -; SI: v_cmp_eq_i32 -define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { - %trunc = trunc i32 %a to i1 - %result = select i1 %trunc, i32 1, i32 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { - %trunc = trunc i64 %x to i1 - %sel = select i1 %trunc, i32 63, i32 -12 - store i32 %sel, i32 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}v_trunc_i64_to_i1: -; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} -; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %x = load i64, i64 addrspace(1)* %gep - - %trunc = trunc i64 %x to i1 - %sel = select i1 %trunc, i32 63, i32 -12 - store i32 %sel, i32 addrspace(1)* %out.gep - ret void -} diff --git a/test/CodeGen/R600/tti-unroll-prefs.ll b/test/CodeGen/R600/tti-unroll-prefs.ll deleted file mode 100644 index 76c32afc1f2..00000000000 --- a/test/CodeGen/R600/tti-unroll-prefs.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s - -; This IR comes from this OpenCL C code: -; -; if (b + 4 > a) { -; for (int i = 0; i < 4; i++, b++) { -; if (b + 1 <= a) -; *(dst + c + b) = 0; -; else -; break; -; } -; } -; -; This test is meant to check that this loop isn't unrolled into more than -; four iterations. The loop unrolling preferences we currently use cause this -; loop to not be unrolled at all, but that may change in the future. 
- -; CHECK-LABEL: @test -; CHECK: store i8 0, i8 addrspace(1)* -; CHECK-NOT: store i8 0, i8 addrspace(1)* -; CHECK: ret void -define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { -entry: - %add = add nsw i32 %b, 4 - %cmp = icmp sgt i32 %add, %a - br i1 %cmp, label %for.cond.preheader, label %if.end7 - -for.cond.preheader: ; preds = %entry - %cmp313 = icmp slt i32 %b, %a - br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit - -if.then4.lr.ph: ; preds = %for.cond.preheader - %0 = sext i32 %c to i64 - br label %if.then4 - -if.then4: ; preds = %if.then4.lr.ph, %if.then4 - %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ] - %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ] - %add2 = add nsw i32 %b.addr.014, 1 - %1 = sext i32 %b.addr.014 to i64 - %add.ptr.sum = add nsw i64 %1, %0 - %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum - store i8 0, i8 addrspace(1)* %add.ptr5, align 1 - %inc = add nsw i32 %i.015, 1 - %cmp1 = icmp slt i32 %inc, 4 - %cmp3 = icmp slt i32 %add2, %a - %or.cond = and i1 %cmp3, %cmp1 - br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge - -for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4 - br label %if.end7.loopexit - -if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader - br label %if.end7 - -if.end7: ; preds = %if.end7.loopexit, %entry - ret void -} diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll deleted file mode 100644 index 11438f267ad..00000000000 --- a/test/CodeGen/R600/uaddo.ll +++ /dev/null @@ -1,85 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}uaddo_i64_zext: -; SI: add -; SI: addc -; SI: addc - -; EG: ADDC_UINT -; EG: ADDC_UINT -define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_uaddo_i32: -; SI: s_add_i32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %uadd, 0 - %carry = extractvalue { i32, i1 } %uadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_uaddo_i32: -; SI: v_add_i32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %uadd, 0 - %carry = 
extractvalue { i32, i1 } %uadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_uaddo_i64: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_uaddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll deleted file mode 100644 index de22a22e502..00000000000 --- a/test/CodeGen/R600/udiv.ll +++ /dev/null @@ -1,48 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -;EG-LABEL: {{^}}test: -;EG-NOT: SETGE_INT -;EG: CF_END - -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1) * %in - %b = load i32, i32 addrspace(1) * %b_ptr - %result = udiv i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -;The code generated by udiv is long and complex and may frequently change. 
-;The goal of this test is to make sure the ISel doesn't fail when it gets -;a v4i32 udiv - -;EG-LABEL: {{^}}test2: -;EG: CF_END -;SI-LABEL: {{^}}test2: -;SI: s_endpgm - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = udiv <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}test4: -;EG: CF_END -;SI-LABEL: {{^}}test4: -;SI: s_endpgm - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = udiv <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll deleted file mode 100644 index b3837f28209..00000000000 --- a/test/CodeGen/R600/udivrem.ll +++ /dev/null @@ -1,345 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_udivrem: -; EG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG: CNDE_INT -; EG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG: CNDE_INT -; EG: MULHI -; EG: MULLO_INT -; EG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] -; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] -; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] -; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] -; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] -; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] -; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { - %result0 = udiv i32 %x, %y - store i32 %result0, i32 addrspace(1)* %out - %result1 = urem i32 %x, %y - store i32 %result1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_udivrem_v2: -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: 
SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { - %result0 = udiv <2 x i32> %x, %y - store <2 x i32> %result0, <2 x i32> addrspace(1)* %out - %result1 = urem <2 x i32> %x, %y - store <2 x i32> %result1, <2 x i32> addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}test_udivrem_v4: -; EG-DAG: RECIP_UINT -; EG-DAG: 
MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; 
SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] -; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] -; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { - %result0 = udiv <4 x i32> %x, %y - store <4 x i32> %result0, <4 x i32> addrspace(1)* %out - %result1 = urem <4 x i32> %x, %y - store <4 x i32> %result1, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll deleted file mode 100644 index 4de881b66f1..00000000000 --- a/test/CodeGen/R600/udivrem24.ll +++ /dev/null @@ -1,245 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}udiv24_i8: -; SI: v_cvt_f32_ubyte -; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i8(i8 
addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = udiv i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}udiv24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = udiv i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}udiv24_i32: -; SI: v_cvt_f32_u32 -; SI-DAG: v_cvt_f32_u32 -; SI-DAG: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}udiv25_i32: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}urem24_i8: -; SI: v_cvt_f32_ubyte 
-; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = urem i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}urem24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = urem i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}urem24_i32: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}urem25_i32: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_urem24_i32_1: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_urem24_i32_2: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 8 
- %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll deleted file mode 100644 index 9f3069bdf80..00000000000 --- a/test/CodeGen/R600/udivrem64.ll +++ /dev/null @@ -1,223 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test_udiv: -;EG: RECIP_UINT -;EG: LSHL {{.*}}, 1, -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = udiv i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem: -;EG: RECIP_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: AND_INT {{.*}}, 1, - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = urem i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_udiv3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 33 - %2 = lshr i64 %y, 33 - %result = udiv i64 %1, %2 - store i64 %result, 
i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 33 - %2 = lshr i64 %y, 33 - %result = urem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_udiv2464: -;EG: UINT_TO_FLT -;EG: UINT_TO_FLT -;EG: FLT_TO_UINT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: v_mad_f32 -;GCN: s_endpgm -define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 40 - %2 = lshr i64 %y, 40 - %result = udiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem2464: -;EG: UINT_TO_FLT -;EG: UINT_TO_FLT -;EG: FLT_TO_UINT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: v_mad_f32 -;GCN: s_endpgm -define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 40 - %2 = lshr i64 %y, 40 - %result = urem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll deleted file mode 100644 index dfec8eb15cb..00000000000 --- a/test/CodeGen/R600/uint_to_fp.f64.ll +++ /dev/null @@ -1,98 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 -; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %val = load i64, i64 addrspace(1)* %gep, align 8 - %result = uitofp i64 %val to double - store double %result, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 -define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { - %cast = uitofp i64 %in to double - store double %cast, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 -define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { - %cast = uitofp <2 x i64> %in to <2 x double> - store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 -define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { - %cast = uitofp <4 x i64> %in to <4 x double> - store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { - %cast = uitofp i32 %in to double - store double %cast, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void 
@s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { - %cast = uitofp <2 x i32> %in to <2 x double> - store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { - %cast = uitofp <4 x i32> %in to <4 x double> - store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: select on 0, 0 -; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to double - store double %fp, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load: -; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1 -; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { - %fp = uitofp i1 %in to double - store double %fp, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll deleted file mode 100644 index 00fea80b1bc..00000000000 --- a/test/CodeGen/R600/uint_to_fp.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32: -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z - -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { - %result = uitofp i32 %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32: -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { - %result = uitofp <2 x i32> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32: -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %value = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = uitofp <4 x i32> %value to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} 
- -; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32: -; R600: UINT_TO_FLT -; R600: UINT_TO_FLT -; R600: MULADD_IEEE -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 -; SI: s_endpgm -define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) { -entry: - %0 = uitofp i64 %in to float - store float %0, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load: -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) { - %fp = uitofp i1 %in to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll deleted file mode 100644 index 82d88ebd3ae..00000000000 --- a/test/CodeGen/R600/unaligned-load-store.ll +++ /dev/null @@ -1,254 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}unaligned_load_store_i16_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { - %v = load i16, i16 addrspace(3)* %p, align 1 - store i16 %v, i16 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i16_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm -define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { - %v = load i16, i16 addrspace(1)* %p, align 1 - store i16 %v, i16 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i32_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { - %v = load i32, i32 addrspace(3)* %p, align 1 - store i32 %v, i32 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i32_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { - %v = load i32, i32 addrspace(1)* %p, align 1 - store i32 %v, i32 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i64_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: 
s_endpgm -define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { - %v = load i64, i64 addrspace(3)* %p, align 1 - store i64 %v, i64 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i64_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { - %v = load i64, i64 addrspace(1)* %p, align 1 - store i64 %v, i64 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_v4i32_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { - %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 - store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 - ret void -} - -; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded. 
-; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind { - %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 - store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4: -; SI: ds_read2_b32 -; SI: s_endpgm -define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %val = load i64, i64 addrspace(3)* %in, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 -; SI: s_endpgm -define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 - %val = load i64, i64 addrspace(3)* %ptr, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset: -; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 -; SI: s_endpgm -define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* - %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 - %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* - %val = load i64, i64 addrspace(3)* %ptri64, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: buffer_store_dwordx2 -; SI: s_endpgm - -define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %val = load i64, i64 addrspace(3)* %in, align 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4: -; SI: ds_write2_b32 -; SI: s_endpgm -define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { - store i64 %val, i64 addrspace(3)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset -; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 -; SI: s_endpgm -define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { - %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 - store i64 0, i64 addrspace(3)* %ptr, align 4 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset: -; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits -; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm -define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { - %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* - 
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 - %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* - store i64 0, i64 addrspace(3)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll deleted file mode 100644 index 036a7e91b47..00000000000 --- a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll +++ /dev/null @@ -1,115 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s - -; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. - -; COMMON-LABEL: {{^}}branch_true: -define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 true, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; COMMON-LABEL: {{^}}branch_false: -; SI: .text -; SI-NEXT: s_endpgm -define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 false, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 
%add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; COMMON-LABEL: {{^}}branch_undef: -; SI: .text -; SI-NEXT: s_endpgm -define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 undef, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll deleted file mode 100644 index 411a15a4b83..00000000000 --- a/test/CodeGen/R600/unroll.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s -; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s - - -; This test contains a simple loop that initializes an array declared in -; private memory. We want to make sure these kinds of loops are always -; unrolled, because private memory is slow. 
- -; CHECK-LABEL: @test -; CHECK-NOT: alloca -; CHECK: store i32 5, i32 addrspace(1)* %out -define void @test(i32 addrspace(1)* %out) { -entry: - %0 = alloca [32 x i32] - br label %loop.header - -loop.header: - %counter = phi i32 [0, %entry], [%inc, %loop.inc] - br label %loop.body - -loop.body: - %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter - store i32 %counter, i32* %ptr - br label %loop.inc - -loop.inc: - %inc = add i32 %counter, 1 - %1 = icmp sge i32 %counter, 32 - br i1 %1, label %exit, label %loop.header - -exit: - %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5 - %3 = load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll deleted file mode 100644 index 8ab4faf2f14..00000000000 --- a/test/CodeGen/R600/unsupported-cc.ll +++ /dev/null @@ -1,125 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; These tests are for condition codes that are not supported by the hardware - -; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5(7.006492e-45) -define void @slt(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp slt i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5(7.006492e-45) -define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp ult i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_float: -; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x -; CHECK-NEXT: 1084227584(5.000000e+00) -; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 -; CHECK-NEXT: LSHR * -define void @ult_float(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ult_float_native(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, float 0.0, float 1.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @olt(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 6(8.407791e-45) -define void @sle(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sle i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 6(8.407791e-45) -define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp ule i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_float: -; CHECK: SETGT * 
T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x -; CHECK-NEXT: 1084227584(5.000000e+00) -; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 -; CHECK-NEXT: LSHR * -define void @ule_float(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ule_float_native(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, float 0.0, float 1.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * -; CHECK-NEXT:1084227584(5.000000e+00) -define void @ole(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll deleted file mode 100644 index daacc771708..00000000000 --- a/test/CodeGen/R600/urecip.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_rcp_iflag_f32_e32 - -define void @test(i32 %p, i32 %q) { - %i = udiv i32 %p, %q - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll deleted file mode 100644 index 62841ec2d6c..00000000000 --- a/test/CodeGen/R600/urem.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; The code generated by urem is long and complex and may frequently -; change. 
The goal of this test is to make sure the ISel doesn't fail -; when it gets a v2i32/v4i32 urem - -; FUNC-LABEL: {{^}}test_urem_i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = urem i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_i32_7: -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 -; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] -; SI: v_subrev_i32 -; SI: v_mul_lo_i32 -; SI: v_sub_i32 -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = urem i32 %num, 7 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v2i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = urem <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v4i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = urem <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1)* %in - %b = load i64, i64 addrspace(1)* %b_ptr - %result = urem i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v2i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1)* %in - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = urem <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v4i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1)* %in - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = urem <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll deleted file mode 100644 index f26f30022b4..00000000000 --- a/test/CodeGen/R600/use-sgpr-multiple-times.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float @llvm.fma.f32(float, float, 
float) #1 -declare float @llvm.fmuladd.f32(float, float, float) #1 -declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 - - -; GCN-LABEL: {{^}}test_sgpr_use_twice_binop: -; GCN: s_load_dword [[SGPR:s[0-9]+]], -; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { - %dbl = fadd float %a, %a - store float %dbl, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op: -; GCN: s_load_dword [[SGPR:s[0-9]+]], -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: 
{{^}}test_sgpr_use_twice_ternary_op_a_imm_a: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; Don't use fma since fma c, x, y is canonicalized to fma x, c, y -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 { - %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1 - store i32 %fma, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll deleted file mode 100644 index 3c9b1622a07..00000000000 --- a/test/CodeGen/R600/usubo.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}usubo_i64_zext: - -; EG: SUBB_UINT -; EG: ADDC_UINT -define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_usubo_i32: -; SI: s_sub_i32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %usub, 0 - %carry = extractvalue { i32, i1 } %usub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_usubo_i32: -; SI: v_subrev_i32_e32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %usub, 0 - %carry = extractvalue { i32, i1 } %usub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_usubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG: SUB_INT -define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = 
extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_usubo_i64: -; SI: v_sub_i32 -; SI: v_subb_u32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG: SUB_INT -define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/v1i64-kernel-arg.ll b/test/CodeGen/R600/v1i64-kernel-arg.ll deleted file mode 100644 index 31755125c03..00000000000 --- a/test/CodeGen/R600/v1i64-kernel-arg.ll +++ /dev/null @@ -1,17 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s - -; CHECK-LABEL: {{^}}kernel_arg_i64: -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { - store i64 %a, i64 addrspace(1)* %out, align 8 - ret void -} - -; i64 arg works, v1i64 arg does not. -; CHECK-LABEL: {{^}}kernel_arg_v1i64: -define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { - store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 - ret void -} - diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll deleted file mode 100644 index c368c5aaf7d..00000000000 --- a/test/CodeGen/R600/v_cndmask.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; SI-LABEL: {{^}}v_cnd_nan_nosgpr: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} -; All nan values are converted to 0xffffffff -; SI: s_endpgm -define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { - %idx = call i32 @llvm.r600.read.tidig.x() #1 - %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx - %f = load float, float addrspace(1)* %fptr - %setcc = icmp ne i32 %c, 0 - %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f - store float %select, float addrspace(1)* %out - ret void -} - - -; This requires slightly trickier SGPR operand legalization since the -; single constant bus SGPR usage is the last operand, and it should -; never be moved. 
- -; SI-LABEL: {{^}}v_cnd_nan: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} -; All nan values are converted to 0xffffffff -; SI: s_endpgm -define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { - %setcc = icmp ne i32 %c, 0 - %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f - store float %select, float addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll deleted file mode 100644 index 7d0ebd139f5..00000000000 --- a/test/CodeGen/R600/valu-i1.ll +++ /dev/null @@ -1,188 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: @test_if -; Make sure the i1 values created by the cfg structurizer pass are -; moved using VALU instructions -; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 -; SI: v_mov_b32_e32 v{{[0-9]}}, -1 -define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { -entry: - switch i32 %a, label %default [ - i32 0, label %case0 - i32 1, label %case1 - ] - -case0: - %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - store i32 0, i32 addrspace(1)* %arrayidx1, align 4 - br label %end - -case1: - %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - store i32 1, i32 addrspace(1)* %arrayidx5, align 4 - br label %end - -default: - %cmp8 = icmp eq i32 %a, 2 - %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - br i1 %cmp8, label %if, label %else - -if: - store i32 2, i32 addrspace(1)* %arrayidx10, align 4 - br label %end - -else: - store i32 3, i32 addrspace(1)* %arrayidx10, align 4 - br label %end - -end: - ret void -} - -; SI-LABEL: @simple_test_v_if -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] - -; SI: ; BB#1 -; SI: buffer_store_dword -; SI: s_endpgm - -; SI: BB1_2: -; SI: s_or_b64 exec, exec, [[BR_SREG]] -; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit - -store: - %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid - store i32 999, i32 addrspace(1)* %gep - ret void - -exit: - ret void -} - -; SI-LABEL: @simple_test_v_loop -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] -; SI: s_cbranch_execz BB2_2 - -; SI: ; BB#1: -; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} - -; SI: BB2_3: -; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: v_cmp_eq_i32_e32 vcc, -; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] -; SI: s_andn2_b64 exec, exec, [[OR_SREG]] -; SI: s_cbranch_execnz BB2_3 - -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %is.0 = icmp ne i32 %tid, 0 - %limit = add i32 %tid, 64 - br i1 %is.0, label %loop, label %exit - -loop: - %i = phi i32 [%tid, %entry], [%i.inc, %loop] - %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i - %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i - %load = load i32, i32 addrspace(1)* %src - store i32 %load, i32 
addrspace(1)* %gep.dst - %i.inc = add nsw i32 %i, 1 - %cmp = icmp eq i32 %limit, %i.inc - br i1 %cmp, label %exit, label %loop - -exit: - ret void -} - -; SI-LABEL: @multi_vcond_loop - -; Load loop limit from buffer -; Branch to exit if uniformly not taken -; SI: ; BB#0: -; SI: buffer_load_dword [[VBOUND:v[0-9]+]] -; SI: v_cmp_lt_i32_e32 vcc -; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] -; SI: s_cbranch_execz BB3_2 - -; Initialize inner condition to false -; SI: ; BB#1: -; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} -; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] - -; Clear exec bits for workitems that load -1s -; SI: BB3_3: -; SI: buffer_load_dword [[B:v[0-9]+]] -; SI: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] -; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] -; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] -; SI: s_cbranch_execz BB3_5 - -; SI: BB#4: -; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] - -; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] -; SI: s_andn2_b64 exec, exec, [[COND_STATE]] -; SI: s_cbranch_execnz BB3_3 - -; SI: BB#6 -; SI: s_or_b64 exec, exec, [[COND_STATE]] - -; SI: BB3_2: -; SI-NOT: [[COND_STATE]] -; SI: s_endpgm - -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { -bb: - %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 - %tmp4 = sext i32 %tmp to i64 - %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 - %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 - %tmp7 = icmp sgt i32 %tmp6, 0 - %tmp8 = sext i32 %tmp6 to i64 - br i1 %tmp7, label %bb10, label %bb26 - -bb10: ; preds = %bb, %bb20 - %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] - %tmp12 = add nsw i64 %tmp11, %tmp4 - %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 - %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 - %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 - %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 - %tmp17 = icmp ne i32 %tmp14, -1 - %tmp18 = icmp ne i32 %tmp16, -1 - %tmp19 = and i1 %tmp17, %tmp18 - br i1 %tmp19, label %bb20, label %bb26 - -bb20: ; preds = %bb10 - %tmp21 = add nsw i32 %tmp16, %tmp14 - %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 - store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 - %tmp23 = add nuw nsw i64 %tmp11, 1 - %tmp24 = icmp slt i64 %tmp23, %tmp8 - br i1 %tmp24, label %bb10, label %bb26 - -bb26: ; preds = %bb10, %bb20, %bb - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll deleted file mode 100644 index 6f3b4847fbd..00000000000 --- a/test/CodeGen/R600/vector-alloca.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s 
-; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}vector_read: -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOVA_INT -define void @vector_read(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 - store i32 0, i32* %x - store i32 1, i32* %y - store i32 2, i32* %z - store i32 3, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index - %2 = load i32, i32* %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_write: -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOVA_INT -; EG: MOVA_INT -define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { -entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 - store i32 0, i32* %x - store i32 0, i32* %y - store i32 0, i32* %z - store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index - store i32 1, i32* %1 - %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index - %3 = load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; This test should be optimize to: -; store i32 0, i32 addrspace(1)* %out -; FUNC-LABEL: {{^}}bitcast_gep: -; EG: STORE_RAW -define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { -entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 - store i32 0, i32* %x - store i32 0, i32* %y - store i32 0, i32* %z - store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %2 = bitcast i32* %1 to [4 x i32]* - %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0 - %4 = load i32, i32* %3 - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll deleted file mode 100644 index fb6a17e6714..00000000000 --- a/test/CodeGen/R600/vertex-fetch-encoding.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s - -; NI: {{^}}vtx_fetch32: -; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 -; CM: {{^}}vtx_fetch32: -; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 - -define void @vtx_fetch32(i32 addrspace(1)* %out, 
i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; NI: {{^}}vtx_fetch128: -; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 -; XXX: Add a case for Cayman when v4i32 stores are supported. - -define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { -entry: - %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in - store <4 x i32> %0, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/vop-shrink.ll b/test/CodeGen/R600/vop-shrink.ll deleted file mode 100644 index 9b2f229c05a..00000000000 --- a/test/CodeGen/R600/vop-shrink.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; Test that we correctly commute a sub instruction -; FUNC-LABEL: {{^}}sub_rev: -; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s -; SI: v_subrev_i32_e32 v{{[0-9]+}}, s - -; ModuleID = 'vop-shrink.ll' - -define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { -entry: - %vgpr = call i32 @llvm.r600.read.tidig.x() #1 - %tmp = icmp eq i32 %cond, 0 - br i1 %tmp, label %if, label %else - -if: ; preds = %entry - %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %tmp2 = extractelement <4 x i32> %sgpr, i32 1 - store i32 %tmp2, i32 addrspace(1)* %out - br label %endif - -else: ; preds = %entry - %tmp3 = extractelement <4 x i32> %sgpr, i32 2 - %tmp4 = sub i32 %vgpr, %tmp3 - store i32 %tmp4, i32 addrspace(1)* %out - br label %endif - -endif: ; preds = %else, %if - ret void -} - -; Test that we fold an immediate that was illegal for a 64-bit op into the -; 32-bit op when we shrink it. 
- -; FUNC-LABEL: {{^}}add_fold: -; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 -define void @add_fold(float addrspace(1)* %out) { -entry: - %tmp = call i32 @llvm.r600.read.tidig.x() - %tmp1 = uitofp i32 %tmp to float - %tmp2 = fadd float %tmp1, 1.024000e+03 - store float %tmp2, float addrspace(1)* %out - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll deleted file mode 100644 index a3014b03d2b..00000000000 --- a/test/CodeGen/R600/vselect.ll +++ /dev/null @@ -1,77 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -;EG: {{^}}test_select_v2i32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v2i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { -entry: - %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 - %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 - %cmp = icmp ne <2 x i32> %0, %1 - %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1 - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v2f32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v2f32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 - %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 - %cmp = fcmp une <2 x float> %0, %1 - %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v4i32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v4i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { -entry: - %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 - %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 - %cmp = icmp ne <4 x i32> %0, %1 - %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v4f32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], 
T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { -entry: - %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 - %1 = load <4 x float>, <4 x float> addrspace(1)* %in1 - %cmp = fcmp une <4 x float> %0, %1 - %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1 - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/vselect64.ll b/test/CodeGen/R600/vselect64.ll deleted file mode 100644 index ef85ebe7899..00000000000 --- a/test/CodeGen/R600/vselect64.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; XXX: Merge this test into vselect.ll once SI supports 64-bit select. - -; CHECK-LABEL: {{^}}test_select_v4i64: -; Make sure the vectors aren't being stored on the stack. We know they are -; being stored on the stack if the shader uses at least 10 registers. -; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X -define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { -entry: - %cmp = icmp ne <4 x i32> %c, - %result = select <4 x i1> %cmp, <4 x i64> , <4 x i64> - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/vtx-fetch-branch.ll b/test/CodeGen/R600/vtx-fetch-branch.ll deleted file mode 100644 index 4584d6e2525..00000000000 --- a/test/CodeGen/R600/vtx-fetch-branch.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s - -; This tests for a bug where vertex fetch clauses right before an ENDIF -; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER -; for the ALU clause before the vertex fetch instead of emitting a POP instruction -; after the fetch clause. - - -; CHECK-LABEL: {{^}}test: -; CHECK-NOT: ALU_POP_AFTER -; CHECK: TEX -; CHECK-NEXT: POP -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %endif, label %if - -if: - %1 = load i32, i32 addrspace(1)* %in - br label %endif - -endif: - %x = phi i32 [ %1, %if], [ 0, %entry] - store i32 %x, i32 addrspace(1)* %out - br label %done - -done: - ret void -} diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll deleted file mode 100644 index 912e258ebb8..00000000000 --- a/test/CodeGen/R600/vtx-schedule.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test is for a scheduler bug where VTX_READ instructions that used -; the result of another VTX_READ instruction were being grouped in the -; same fetch clause.
- -; CHECK: {{^}}test: -; CHECK: Fetch clause -; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 -; CHECK: Fetch clause -; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 -define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { -entry: - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll deleted file mode 100644 index 5cc7577cad3..00000000000 --- a/test/CodeGen/R600/wait.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_load_dwordx4 -; CHECK: s_load_dwordx4 -; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; CHECK: s_endpgm -define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { -main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) - %tmp12 = extractelement <4 x float> %tmp11, i32 0 - %tmp13 = extractelement <4 x float> %tmp11, i32 1 - call void @llvm.AMDGPU.barrier.global() #1 - %tmp14 = extractelement <4 x float> %tmp11, i32 2 -; %tmp15 = extractelement <4 x float> %tmp11, i32 3 - %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt - %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 - %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 - %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) - %tmp19 = extractelement <4 x float> %tmp18, i32 0 - %tmp20 = extractelement <4 x float> %tmp18, i32 1 - %tmp21 = extractelement <4 x float> %tmp18, i32 2 - %tmp22 = extractelement <4 x float> %tmp18, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.global() #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { noduplicate nounwind } -attributes #2 = { nounwind readnone } - -!0 = !{!1, !1, i64 0, i32 1} -!1 = !{!"const", null} diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll deleted file mode 100644 index 4328e964c1b..00000000000 --- a/test/CodeGen/R600/work-item-intrinsics.ll +++ /dev/null @@ -1,238 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood 
< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}ngroups_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X - -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ngroups_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ngroups_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: 
buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. - -; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_x: -; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_y: -; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_z: -; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.ngroups.x() #0 -declare i32 @llvm.r600.read.ngroups.y() #0 -declare i32 @llvm.r600.read.ngroups.z() #0 - -declare i32 @llvm.r600.read.global.size.x() #0 -declare i32 @llvm.r600.read.global.size.y() #0 -declare i32 @llvm.r600.read.global.size.z() #0 - -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - -declare i32 @llvm.r600.read.tgid.x() #0 -declare i32 @llvm.r600.read.tgid.y() #0 -declare i32 @llvm.r600.read.tgid.z() #0 - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.tidig.y() #0 -declare i32 @llvm.r600.read.tidig.z() #0 - -declare i32 @llvm.AMDGPU.read.workdim() #0 - -attributes #0 = { readnone } diff --git 
a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll deleted file mode 100644 index 8b383e4c393..00000000000 --- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s - -; We want all MULLO_INT inst to be last in their instruction group -;CHECK: {{^}}fill3d: -;CHECK-NOT: MULLO_INT T[0-9]+ - -define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { -entry: - %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 - %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 - %mul = mul i32 %y.i18, %x.i - %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1 - %mul3 = mul i32 %mul, %z.i17 - %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1 - %mul26.i = mul i32 %x.i12.i, %x.i.i - %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.i16 = add i32 %x.i4.i, %mul26.i - %mul7 = mul i32 %add.i16, %y.i18 - %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1 - %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1 - %mul30.i = mul i32 %y.i14.i, %y.i.i - %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %add.i14 = add i32 %mul30.i, %mul7 - %mul819 = add i32 %add.i14, %y.i6.i - %add = mul i32 %mul819, %z.i17 - %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1 - %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1 - %mul33.i = mul i32 %z.i16.i, %z.i.i - %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1 - %add.i = add i32 %z.i8.i, %mul33.i - %add13 = add i32 %add.i, %add - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13 - store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.z() #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - -!opencl.kernels = !{!0, !1, !2} - -!0 = !{null} -!1 = !{null} -!2 = !{void (i32 addrspace(1)*)* @fill3d} diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll deleted file mode 100644 index 089db59eabc..00000000000 --- a/test/CodeGen/R600/xor.ll +++ /dev/null @@ -1,173 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}xor_v2i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 - %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 - %result = xor <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}xor_v4i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 - %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 - %result = xor <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} - -; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} -; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} -; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { - %a = load float, float addrspace(1) * %in0 - %b = load float, float addrspace(1) * %in1 - %acmp = fcmp oge float %a, 0.000000e+00 - %bcmp = fcmp oge float %b, 1.000000e+00 - %xor = xor i1 %acmp, %bcmp - %result = select i1 %xor, float %a, float %b - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_xor_i1: -; SI: buffer_load_ubyte [[B:v[0-9]+]] -; SI: buffer_load_ubyte [[A:v[0-9]+]] -; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] -; SI: buffer_store_byte [[RESULT]] -define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { - %a = load i1, i1 addrspace(1)* %in0 - %b = load i1, i1 addrspace(1)* %in1 - %xor = xor i1 %a, %b - store i1 %xor, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_xor_i32: -; SI: v_xor_b32_e32 -define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { - %a = load i32, i32 addrspace(1)* %in0 - %b = load i32, i32 addrspace(1)* %in1 - %result = xor i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_xor_i32: -; SI: s_xor_b32 -define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %result = xor i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_not_i32: -; SI: s_not_b32 
-define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { - %result = xor i32 %a, -1 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_not_i32: -; SI: v_not_b32 -define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { - %a = load i32, i32 addrspace(1)* %in0 - %b = load i32, i32 addrspace(1)* %in1 - %result = xor i32 %a, -1 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_xor_i64: -; SI: v_xor_b32_e32 -; SI: v_xor_b32_e32 -; SI: s_endpgm -define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { - %a = load i64, i64 addrspace(1)* %in0 - %b = load i64, i64 addrspace(1)* %in1 - %result = xor i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_xor_i64: -; SI: s_xor_b64 -; SI: s_endpgm -define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %result = xor i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_not_i64: -; SI: s_not_b64 -define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { - %result = xor i64 %a, -1 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_not_i64: -; SI: v_not_b32 -; SI: v_not_b32 -define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { - %a = load i64, i64 addrspace(1)* %in0 - %b = load i64, i64 addrspace(1)* %in1 - %result = xor i64 %a, -1 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Test that we have a pattern to match xor inside a branch. -; Note that in the future the backend may be smart enough to -; use an SALU instruction for this. - -; FUNC-LABEL: {{^}}xor_cf: -; SI: s_xor_b64 -define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = xor i64 %a, %b - br label %endif - -else: - %2 = load i64, i64 addrspace(1)* %in - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll deleted file mode 100644 index 033055db185..00000000000 --- a/test/CodeGen/R600/zero_extend.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI - -; R600: {{^}}test: -; R600: MEM_RAT_CACHELESS STORE_RAW -; R600: MEM_RAT_CACHELESS STORE_RAW - -; SI: {{^}}test: -; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} -; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] -; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = mul i32 %a, %b - %1 = add i32 %0, %c - %2 = zext i32 %1 to i64 - store i64 %2, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}testi1toi32: -; SI: v_cndmask_b32 -define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a, %b - %1 = zext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}zext_i1_to_i64: -; SI: s_mov_b32 s{{[0-9]+}}, 0 -; SI: v_cmp_eq_i32 -; SI: v_cndmask_b32 -define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq 
i32 %a, %b - %ext = zext i1 %cmp to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/MC/AMDGPU/ds-err.s b/test/MC/AMDGPU/ds-err.s new file mode 100644 index 00000000000..52c2740bec2 --- /dev/null +++ b/test/MC/AMDGPU/ds-err.s @@ -0,0 +1,23 @@ +// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s + +// offset too big +// CHECK: invalid operand for instruction +ds_add_u32 v2, v4 offset:1000000000 + +// offset0 twice +// CHECK: error: not a valid operand. +ds_write2_b32 v2, v4, v6 offset0:4 offset0:8 + +// offset1 twice +// CHECK: error: not a valid operand. +ds_write2_b32 v2, v4, v6 offset1:4 offset1:8 + +// offset0 too big +// CHECK: invalid operand for instruction +ds_write2_b32 v2, v4, v6 offset0:1000000000 + +// offset1 too big +// CHECK: invalid operand for instruction +ds_write2_b32 v2, v4, v6 offset1:1000000000 + diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s new file mode 100644 index 00000000000..ad63229ba2e --- /dev/null +++ b/test/MC/AMDGPU/ds.s @@ -0,0 +1,337 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Checks for 16-bit Offsets +//===----------------------------------------------------------------------===// + +ds_add_u32 v2, v4 offset:16 +// CHECK: ds_add_u32 v2, v4 offset:16 ; encoding: [0x10,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] + +//===----------------------------------------------------------------------===// +// Checks for 2 8-bit Offsets +//===----------------------------------------------------------------------===// + +ds_write2_b32 v2, v4, v6 offset0:4 +// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 ; encoding: [0x04,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 +// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 ; encoding: [0x04,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_write2_b32 v2, v4, v6 offset1:8 +// CHECK: ds_write2_b32 v2, v4, v6 offset1:8 ; encoding: [0x00,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_read2_b32 v[8:9], v2 offset0:4 +// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 ; encoding: [0x04,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] + +ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 +// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 ; encoding: [0x04,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] + +ds_read2_b32 v[8:9], v2 offset1:8 +// CHECK: ds_read2_b32 v[8:9], v2 offset1:8 ; encoding: [0x00,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +ds_add_u32 v2, v4 +// CHECK: ds_add_u32 v2, v4 ; encoding: [0x00,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] + +ds_sub_u32 v2, v4 +// CHECK: ds_sub_u32 v2, v4 ; encoding: [0x00,0x00,0x04,0xd8,0x02,0x04,0x00,0x00] + +ds_rsub_u32 v2, v4 +// CHECK: ds_rsub_u32 v2, v4 ; encoding: [0x00,0x00,0x08,0xd8,0x02,0x04,0x00,0x00] + +ds_inc_u32 v2, v4 +// CHECK: ds_inc_u32 v2, v4 ; encoding: [0x00,0x00,0x0c,0xd8,0x02,0x04,0x00,0x00] + +ds_dec_u32 v2, v4 +// CHECK: ds_dec_u32 v2, v4 ; encoding: [0x00,0x00,0x10,0xd8,0x02,0x04,0x00,0x00] + +ds_min_i32 v2, v4 +// CHECK: ds_min_i32 v2, v4 ; encoding: [0x00,0x00,0x14,0xd8,0x02,0x04,0x00,0x00] + +ds_max_i32 v2, v4 +// CHECK: ds_max_i32 v2, v4 ; encoding: [0x00,0x00,0x18,0xd8,0x02,0x04,0x00,0x00] + +ds_min_u32 v2, v4 +// 
CHECK: ds_min_u32 v2, v4 ; encoding: [0x00,0x00,0x1c,0xd8,0x02,0x04,0x00,0x00] + +ds_max_u32 v2, v4 +// CHECK: ds_max_u32 v2, v4 ; encoding: [0x00,0x00,0x20,0xd8,0x02,0x04,0x00,0x00] + +ds_and_b32 v2, v4 +// CHECK: ds_and_b32 v2, v4 ; encoding: [0x00,0x00,0x24,0xd8,0x02,0x04,0x00,0x00] + +ds_or_b32 v2, v4 +// CHECK: ds_or_b32 v2, v4 ; encoding: [0x00,0x00,0x28,0xd8,0x02,0x04,0x00,0x00] + +ds_xor_b32 v2, v4 +// CHECK: ds_xor_b32 v2, v4 ; encoding: [0x00,0x00,0x2c,0xd8,0x02,0x04,0x00,0x00] + +ds_mskor_b32 v2, v4, v6 +// CHECK: ds_mskor_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x04,0x06,0x00] + +ds_write_b32 v2, v4 +// CHECK: ds_write_b32 v2, v4 ; encoding: [0x00,0x00,0x34,0xd8,0x02,0x04,0x00,0x00] + +ds_write2_b32 v2, v4, v6 +// CHECK: ds_write2_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_write2st64_b32 v2, v4, v6 +// CHECK: ds_write2st64_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x3c,0xd8,0x02,0x04,0x06,0x00] + +ds_cmpst_b32 v2, v4, v6 +// CHECK: ds_cmpst_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x40,0xd8,0x02,0x04,0x06,0x00] + +ds_cmpst_f32 v2, v4, v6 +// CHECK: ds_cmpst_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x44,0xd8,0x02,0x04,0x06,0x00] + +ds_min_f32 v2, v4, v6 +// CHECK: ds_min_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x48,0xd8,0x02,0x04,0x06,0x00] + +ds_max_f32 v2, v4, v6 +// CHECK: ds_max_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x4c,0xd8,0x02,0x04,0x06,0x00] + +ds_gws_init v2 gds +// CHECK: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_sema_v v2 gds +// CHECK: ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x6a,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_sema_br v2 gds +// CHECK: ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x6e,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_sema_p v2 gds +// CHECK: ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x72,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_barrier v2 gds +// CHECK: ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x76,0xd8,0x02,0x00,0x00,0x00] + +ds_write_b8 v2, v4 +// CHECK: ds_write_b8 v2, v4 ; encoding: [0x00,0x00,0x78,0xd8,0x02,0x04,0x00,0x00] + +ds_write_b16 v2, v4 +// CHECK: ds_write_b16 v2, v4 ; encoding: [0x00,0x00,0x7c,0xd8,0x02,0x04,0x00,0x00] + +ds_add_rtn_u32 v8, v2, v4 +// CHECK: ds_add_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x80,0xd8,0x02,0x04,0x00,0x08] + +ds_sub_rtn_u32 v8, v2, v4 +// CHECK: ds_sub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x84,0xd8,0x02,0x04,0x00,0x08] + +ds_rsub_rtn_u32 v8, v2, v4 +// CHECK: ds_rsub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x88,0xd8,0x02,0x04,0x00,0x08] + +ds_inc_rtn_u32 v8, v2, v4 +// CHECK: ds_inc_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x8c,0xd8,0x02,0x04,0x00,0x08] + +ds_dec_rtn_u32 v8, v2, v4 +// CHECK: ds_dec_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x90,0xd8,0x02,0x04,0x00,0x08] + +ds_min_rtn_i32 v8, v2, v4 +// CHECK: ds_min_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x94,0xd8,0x02,0x04,0x00,0x08] + +ds_max_rtn_i32 v8, v2, v4 +// CHECK: ds_max_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x98,0xd8,0x02,0x04,0x00,0x08] + +ds_min_rtn_u32 v8, v2, v4 +// CHECK: ds_min_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x9c,0xd8,0x02,0x04,0x00,0x08] + +ds_max_rtn_u32 v8, v2, v4 +// CHECK: ds_max_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0xa0,0xd8,0x02,0x04,0x00,0x08] + +ds_and_rtn_b32 v8, v2, v4 +// CHECK: ds_and_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa4,0xd8,0x02,0x04,0x00,0x08] + +ds_or_rtn_b32 v8, v2, v4 +// CHECK: ds_or_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa8,0xd8,0x02,0x04,0x00,0x08] + +ds_xor_rtn_b32 v8, v2, v4 +// CHECK: 
ds_xor_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xac,0xd8,0x02,0x04,0x00,0x08] + +ds_mskor_rtn_b32 v8, v2, v4, v6 +// CHECK: ds_mskor_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xb0,0xd8,0x02,0x04,0x06,0x08] + +ds_wrxchg_rtn_b32 v8, v2, v4 +// CHECK: ds_wrxchg_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xb4,0xd8,0x02,0x04,0x00,0x08] + +ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 +// CHECK: ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xb8,0xd8,0x02,0x04,0x06,0x08] + +ds_wrxchg2st64_rtn_b32 v[8:9] v2, v4, v6 +// CHECK: ds_wrxchg2st64_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xbc,0xd8,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_b32 v8, v2, v4, v6 +// CHECK: ds_cmpst_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc0,0xd8,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_f32 v8, v2, v4, v6 +// CHECK: ds_cmpst_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc4,0xd8,0x02,0x04,0x06,0x08] + +ds_min_rtn_f32 v8, v2, v4, v6 +// CHECK: ds_min_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc8,0xd8,0x02,0x04,0x06,0x08] + +ds_max_rtn_f32 v8, v2, v4, v6 +// CHECK: ds_max_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x06,0x08] + +ds_swizzle_b32 v8, v2 +// CHECK: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] + +ds_read_b32 v8, v2 +// CHECK: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08] + +ds_read2_b32 v[8:9], v2 +// CHECK: ds_read2_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] + +ds_read2st64_b32 v[8:9], v2 +// CHECK: ds_read2st64_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xe0,0xd8,0x02,0x00,0x00,0x08] + +ds_read_i8 v8, v2 +// CHECK: ds_read_i8 v8, v2 ; encoding: [0x00,0x00,0xe4,0xd8,0x02,0x00,0x00,0x08] + +ds_read_u8 v8, v2 +// CHECK: ds_read_u8 v8, v2 ; encoding: [0x00,0x00,0xe8,0xd8,0x02,0x00,0x00,0x08] + +ds_read_i16 v8, v2 +// CHECK: ds_read_i16 v8, v2 ; encoding: [0x00,0x00,0xec,0xd8,0x02,0x00,0x00,0x08] + +ds_read_u16 v8, v2 +// CHECK: ds_read_u16 v8, v2 ; encoding: [0x00,0x00,0xf0,0xd8,0x02,0x00,0x00,0x08] + +ds_consume v8 +// CHECK: ds_consume v8 ; encoding: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x08] + +ds_append v8 +// CHECK: ds_append v8 ; encoding: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x08] + +ds_ordered_count v8, v2 gds +// CHECK: ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0xfe,0xd8,0x02,0x00,0x00,0x08] + +ds_add_u64 v2, v[4:5] +// CHECK: ds_add_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x00,0xd9,0x02,0x04,0x00,0x00] + +ds_sub_u64 v2, v[4:5] +// CHECK: ds_sub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x04,0xd9,0x02,0x04,0x00,0x00] + +ds_rsub_u64 v2, v[4:5] +// CHECK: ds_rsub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x08,0xd9,0x02,0x04,0x00,0x00] + +ds_inc_u64 v2, v[4:5] +// CHECK: ds_inc_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x0c,0xd9,0x02,0x04,0x00,0x00] + +ds_dec_u64 v2, v[4:5] +// CHECK: ds_dec_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x10,0xd9,0x02,0x04,0x00,0x00] + +ds_min_i64 v2, v[4:5] +// CHECK: ds_min_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x14,0xd9,0x02,0x04,0x00,0x00] + +ds_max_i64 v2, v[4:5] +// CHECK: ds_max_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x18,0xd9,0x02,0x04,0x00,0x00] + +ds_min_u64 v2, v[4:5] +// CHECK: ds_min_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x1c,0xd9,0x02,0x04,0x00,0x00] + +ds_max_u64 v2, v[4:5] +// CHECK: ds_max_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x20,0xd9,0x02,0x04,0x00,0x00] + +ds_and_b64 v2, v[4:5] +// CHECK: ds_and_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x24,0xd9,0x02,0x04,0x00,0x00] + +ds_or_b64 v2, v[4:5] +// CHECK: ds_or_b64 v2, v[4:5] ; encoding: 
[0x00,0x00,0x28,0xd9,0x02,0x04,0x00,0x00] + +ds_xor_b64 v2, v[4:5] +// CHECK: ds_xor_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x2c,0xd9,0x02,0x04,0x00,0x00] + +ds_mskor_b64 v2, v[4:5], v[6:7] +// CHECK: ds_mskor_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x30,0xd9,0x02,0x04,0x06,0x00] + +ds_write_b64 v2, v[4:5] +// CHECK: ds_write_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x34,0xd9,0x02,0x04,0x00,0x00] + +ds_write2_b64 v2, v[4:5], v[6:7] +// CHECK: ds_write2_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x38,0xd9,0x02,0x04,0x06,0x00] + +ds_write2st64_b64 v2, v[4:5], v[6:7] +// CHECK: ds_write2st64_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x3c,0xd9,0x02,0x04,0x06,0x00] + +ds_cmpst_b64 v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x40,0xd9,0x02,0x04,0x06,0x00] + +ds_cmpst_f64 v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_f64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x44,0xd9,0x02,0x04,0x06,0x00] + +ds_min_f64 v2, v[4:5] +// CHECK: ds_min_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x48,0xd9,0x02,0x04,0x00,0x00] + +ds_max_f64 v2, v[4:5] +// CHECK: ds_max_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x4c,0xd9,0x02,0x04,0x00,0x00] + +ds_add_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_add_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x80,0xd9,0x02,0x04,0x00,0x08] + +ds_sub_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_sub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x84,0xd9,0x02,0x04,0x00,0x08] + +ds_rsub_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_rsub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x88,0xd9,0x02,0x04,0x00,0x08] + +ds_inc_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_inc_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x8c,0xd9,0x02,0x04,0x00,0x08] + +ds_dec_rtn_u64 v[8:9] v2, v[4:5] +// CHECK: ds_dec_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x90,0xd9,0x02,0x04,0x00,0x08] + +ds_min_rtn_i64 v[8:9], v2, v[4:5] +// CHECK: ds_min_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x94,0xd9,0x02,0x04,0x00,0x08] + +ds_max_rtn_i64 v[8:9], v2, v[4:5] +// CHECK: ds_max_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x98,0xd9,0x02,0x04,0x00,0x08] + +ds_min_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_min_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x9c,0xd9,0x02,0x04,0x00,0x08] + +ds_max_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_max_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa0,0xd9,0x02,0x04,0x00,0x08] + +ds_and_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_and_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa4,0xd9,0x02,0x04,0x00,0x08] + +ds_or_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_or_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa8,0xd9,0x02,0x04,0x00,0x08] + +ds_xor_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_xor_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xac,0xd9,0x02,0x04,0x00,0x08] + +ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] +// CHECK: ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb0,0xd9,0x02,0x04,0x06,0x08] + +ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xb4,0xd9,0x02,0x04,0x00,0x08] + +ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] +// CHECK: ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb8,0xd9,0x02,0x04,0x06,0x08] + +ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] +// CHECK: ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xbc,0xd9,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: 
[0x00,0x00,0xc0,0xd9,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc4,0xd9,0x02,0x04,0x06,0x08] + +ds_min_rtn_f64 v[8:9], v2, v[4:5] +// CHECK: ds_min_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xc8,0xd9,0x02,0x04,0x00,0x08] + +ds_max_rtn_f64 v[8:9], v2, v[4:5] +// CHECK: ds_max_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xcc,0xd9,0x02,0x04,0x00,0x08] + +ds_read_b64 v[8:9], v2 +// CHECK: ds_read_b64 v[8:9], v2 ; encoding: [0x00,0x00,0xd8,0xd9,0x02,0x00,0x00,0x08] + +ds_read2_b64 v[8:11], v2 +// CHECK: ds_read2_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xdc,0xd9,0x02,0x00,0x00,0x08] + +ds_read2st64_b64 v[8:11], v2 +// CHECK: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xe0,0xd9,0x02,0x00,0x00,0x08] diff --git a/test/MC/AMDGPU/flat.s b/test/MC/AMDGPU/flat.s new file mode 100644 index 00000000000..adad29a5595 --- /dev/null +++ b/test/MC/AMDGPU/flat.s @@ -0,0 +1,477 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=CIVI --check-prefix=CI +// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=CIVI + +// FIXME: These instructions give an 'invalid operand' error on SI and should +// instead be reporting an 'instruction not supported' error. + +// XUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=NOVI +// XUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI +// XUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI + +//===----------------------------------------------------------------------===// +// Operands +//===----------------------------------------------------------------------===// + +flat_load_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] glc slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] glc tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] + 
+flat_load_dword v1, v[3:4] slc glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] slc tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_store_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] glc slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] glc tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] slc glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] slc tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe +// NOSI: error: 
instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +// FIXME: For atomic instructions, glc must be placed immediately following +// the data register. These forms aren't currently supported: +// flat_atomic_add v1, v[3:4], v5 slc glc +// flat_atomic_add v1, v[3:4], v5 slc glc tfe +// flat_atomic_add v1, v[3:4], v5 slc tfe glc +// flat_atomic_add v1, v[3:4], v5 tfe glc +// flat_atomic_add v[3:4], v5 tfe glc +// flat_atomic_add v1, v[3:4], v5 tfe glc slc +// flat_atomic_add v1, v[3:4], v5 tfe slc glc + +flat_atomic_add v1 v[3:4], v5 glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_add v1 v[3:4], v5 glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x80,0x01] + +flat_atomic_add v1 v[3:4], v5 glc slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] + +flat_atomic_add v1 v[3:4], v5 glc tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] + +flat_atomic_add v[3:4], v5 slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_add v[3:4], v5 slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x80,0x00] + +flat_atomic_add v[3:4], v5 tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x80,0x00] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +flat_load_ubyte v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_ubyte v1, v[3:4] ; encoding: [0x00,0x00,0x20,0xdc,0x03,0x00,0x00,0x01] + +flat_load_sbyte v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_sbyte v1, v[3:4] ; encoding: [0x00,0x00,0x24,0xdc,0x03,0x00,0x00,0x01] + +flat_load_ushort v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_ushort v1, v[3:4] ; encoding: [0x00,0x00,0x28,0xdc,0x03,0x00,0x00,0x01] + +flat_load_sshort v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// 
CIVI: flat_load_sshort v1, v[3:4] ; encoding: [0x00,0x00,0x2c,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dwordx2 v[1:2], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x34,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dwordx4 v[5:8], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x38,0xdc,0x03,0x00,0x00,0x05] + +flat_load_dwordx3 v[5:7], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x3c,0xdc,0x03,0x00,0x00,0x05] + +flat_store_byte v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_byte v1, v[3:4] ; encoding: [0x00,0x00,0x60,0xdc,0x03,0x01,0x00,0x00] + +flat_store_short v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_short v1, v[3:4] ; encoding: [0x00,0x00,0x68,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dwordx2 v[1:2], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x74,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dwordx4 v[5:8], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x78,0xdc,0x03,0x05,0x00,0x00] + +flat_store_dwordx3 v[5:7], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x7c,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_swap v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap v[3:4], v5 ; encoding: [0x00,0x00,0xc0,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_swap v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc1,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_cmpswap v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap v[3:4], v[5:6] ; encoding: [0x00,0x00,0xc4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_cmpswap v1, v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap v1, v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0xc5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_add v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_add v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_sub v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub v[3:4], v5 ; encoding: [0x00,0x00,0xcc,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_sub v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xcd,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_smin v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin 
v[3:4], v5 ; encoding: [0x00,0x00,0xd4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_smin v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_umin v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin v[3:4], v5 ; encoding: [0x00,0x00,0xd8,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_umin v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd9,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_smax v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax v[3:4], v5 ; encoding: [0x00,0x00,0xdc,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_smax v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xdd,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_umax v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax v[3:4], v5 ; encoding: [0x00,0x00,0xe0,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_umax v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe1,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_and v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and v[3:4], v5 ; encoding: [0x00,0x00,0xe4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_and v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_or v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or v[3:4], v5 ; encoding: [0x00,0x00,0xe8,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_or v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe9,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_xor v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor v[3:4], v5 ; encoding: [0x00,0x00,0xec,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_xor v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xed,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_inc v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc v[3:4], v5 ; encoding: [0x00,0x00,0xf0,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_inc v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf1,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_dec v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec v[3:4], v5 ; encoding: [0x00,0x00,0xf4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_dec v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_swap_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x40,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_swap_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap_x2 v[1:2], v[3:4], 
v[5:6] glc ; encoding: [0x00,0x00,0x41,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_cmpswap_x2 v[3:4], v[5:8] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x44,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x45,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_add_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x48,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x49,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_sub_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x4c,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x4d,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_smin_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x54,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x55,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_umin_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x58,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x59,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_smax_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x5c,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x5d,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_umax_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x60,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x61,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_and_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x64,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x65,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_or_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x68,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_or_x2 v[1:2], v[3:4], 
v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x69,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_xor_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x6c,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x6d,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_inc_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x70,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x71,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_dec_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x74,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x75,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_fcmpswap_x2 v[3:4], v[5:8] +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fcmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x78,0xdd,0x03,0x05,0x00,0x00] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x79,0xdd,0x03,0x05,0x00,0x01] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmin_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x7c,0xdd,0x03,0x05,0x00,0x00] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x7d,0xdd,0x03,0x05,0x00,0x01] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmax_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x80,0xdd,0x03,0x05,0x00,0x00] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x81,0xdd,0x03,0x05,0x00,0x01] +// NOVI: error: instruction not supported on this GPU diff --git a/test/MC/AMDGPU/lit.local.cfg b/test/MC/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..2a665f06be7 --- /dev/null +++ b/test/MC/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/MC/AMDGPU/mubuf.s b/test/MC/AMDGPU/mubuf.s new file mode 100644 index 00000000000..78d365abef1 --- /dev/null +++ b/test/MC/AMDGPU/mubuf.s @@ -0,0 +1,352 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + 
+//===----------------------------------------------------------------------===// +// Test for different operand combinations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// load - immediate offset only +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, s[4:7], s1 +// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 glc +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 slc +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x41,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 tfe +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x81,0x01] + +buffer_load_dword v1, s[4:7], s1 tfe glc +// CHECK: buffer_load_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x30,0xe0,0x00,0x01,0x81,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] + +buffer_load_dword v1, s[4:7], s1 glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - vgpr offset +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v2, s[4:7], s1 offen +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen tfe glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - vgpr index +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v2, s[4:7], s1 idxen +// CHECK: buffer_load_dword 
v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 tfe +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 tfe ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen tfe glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - vgpr index and offset +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - addr64 +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 +// 
CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 tfe glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - immediate offset only +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, s[4:7], s1 +// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 glc +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 slc +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x41,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 tfe +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x81,0x01] + +buffer_store_dword v1, s[4:7], s1 tfe glc +// CHECK: buffer_store_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x70,0xe0,0x00,0x01,0x81,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] + +buffer_store_dword v1, s[4:7], s1 glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - vgpr offset +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v2, s[4:7], s1 offen +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc +// CHECK: 
buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen tfe glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - vgpr index +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v2, s[4:7], s1 idxen +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen tfe glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - vgpr index and offset +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe +// CHECK: 
buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - addr64 +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 tfe glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +buffer_load_format_x v1, s[4:7], s1 +// CHECK: buffer_load_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_xy v[1:2], s[4:7], s1 +// CHECK: buffer_load_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_xyz v[1:3], s[4:7], s1 +// CHECK: buffer_load_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_xyzw v[1:4], s[4:7], s1 +// CHECK: buffer_load_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_x v1, s[4:7], s1 +// CHECK: buffer_store_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_xy v[1:2], s[4:7], s1 +// CHECK: buffer_store_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x01,0x01,0x01] + 
+buffer_store_format_xyz v[1:3], s[4:7], s1 +// CHECK: buffer_store_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_xyzw v[1:4], s[4:7], s1 +// CHECK: buffer_store_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_ubyte v1, s[4:7], s1 +// CHECK: buffer_load_ubyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_sbyte v1, s[4:7], s1 +// CHECK: buffer_load_sbyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_ushort v1, s[4:7], s1 +// CHECK: buffer_load_ushort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_sshort v1, s[4:7], s1 +// CHECK: buffer_load_sshort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 +// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dwordx2 v[1:2], s[4:7], s1 +// CHECK: buffer_load_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dwordx4 v[1:4], s[4:7], s1 +// CHECK: buffer_load_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_byte v1, s[4:7], s1 +// CHECK: buffer_store_byte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_short v1, s[4:7], s1 +// CHECK: buffer_store_short v1, s[4:7], s1 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1 s[4:7], s1 +// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dwordx2 v[1:2], s[4:7], s1 +// CHECK: buffer_store_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dwordx4 v[1:4], s[4:7], s1 +// CHECK: buffer_store_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x01,0x01,0x01] + +// TODO: Atomics diff --git a/test/MC/AMDGPU/smrd.s b/test/MC/AMDGPU/smrd.s new file mode 100644 index 00000000000..b67abf7e689 --- /dev/null +++ b/test/MC/AMDGPU/smrd.s @@ -0,0 +1,32 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +s_load_dword s1, s[2:3], 1 +// CHECK: s_load_dword s1, s[2:3], 0x1 ; encoding: [0x01,0x83,0x00,0xc0] + +s_load_dword s1, s[2:3], s4 +// CHECK: s_load_dword s1, s[2:3], s4 ; encoding: [0x04,0x82,0x00,0xc0] + +s_load_dwordx2 s[2:3], s[2:3], 1 +// CHECK: s_load_dwordx2 s[2:3], s[2:3], 0x1 ; encoding: [0x01,0x03,0x41,0xc0] + +s_load_dwordx2 s[2:3], s[2:3], s4 +// CHECK: s_load_dwordx2 s[2:3], s[2:3], s4 ; encoding: [0x04,0x02,0x41,0xc0] + +s_load_dwordx4 s[4:7], s[2:3], 1 +// CHECK: s_load_dwordx4 s[4:7], s[2:3], 0x1 ; encoding: [0x01,0x03,0x82,0xc0] + +s_load_dwordx4 s[4:7], s[2:3], s4 +// CHECK: s_load_dwordx4 s[4:7], s[2:3], s4 ; encoding: [0x04,0x02,0x82,0xc0] + +s_load_dwordx8 s[8:15], s[2:3], 1 +// CHECK: s_load_dwordx8 s[8:15], s[2:3], 0x1 ; encoding: [0x01,0x03,0xc4,0xc0] + +s_load_dwordx8 s[8:15], s[2:3], s4 +// CHECK: s_load_dwordx8 s[8:15], s[2:3], s4 ; encoding: [0x04,0x02,0xc4,0xc0] + +s_load_dwordx16 s[16:31], s[2:3], 1 +// CHECK: s_load_dwordx16 s[16:31], s[2:3], 0x1 ; encoding: [0x01,0x03,0x08,0xc1] + +s_load_dwordx16 s[16:31], s[2:3], s4 +// CHECK: s_load_dwordx16 s[16:31], s[2:3], s4 ; encoding: [0x04,0x02,0x08,0xc1] diff --git a/test/MC/AMDGPU/sop1-err.s b/test/MC/AMDGPU/sop1-err.s new file mode 100644 index 
00000000000..f892356b623 --- /dev/null +++ b/test/MC/AMDGPU/sop1-err.s @@ -0,0 +1,37 @@ +// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s + +s_mov_b32 v1, s2 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s1, v0 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s[1:2], s0 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s0, s[1:2] +// CHECK: error: invalid operand for instruction + +s_mov_b32 s220, s0 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s0, s220 +// CHECK: error: invalid operand for instruction + +s_mov_b64 s1, s[0:1] +// CHECK: error: invalid operand for instruction + +s_mov_b64 s[0:1], s1 +// CHECK: error: invalid operand for instruction + +// Immediate greater than 32-bits +s_mov_b32 s1, 0xfffffffff +// CHECK: error: invalid immediate: only 32-bit values are legal + +// Immediate greater than 32-bits +s_mov_b64 s[0:1], 0xfffffffff +// CHECK: error: invalid immediate: only 32-bit values are legal + +// Out of range register +s_mov_b32 s diff --git a/test/MC/AMDGPU/sop1.s b/test/MC/AMDGPU/sop1.s new file mode 100644 index 00000000000..92ca73f2500 --- /dev/null +++ b/test/MC/AMDGPU/sop1.s @@ -0,0 +1,177 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +s_mov_b32 s1, s2 +// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] + +s_mov_b32 s1, 1 +// CHECK: s_mov_b32 s1, 1 ; encoding: [0x81,0x03,0x81,0xbe] + +s_mov_b32 s1, 100 +// CHECK: s_mov_b32 s1, 0x64 ; encoding: [0xff,0x03,0x81,0xbe,0x64,0x00,0x00,0x00] + +s_mov_b64 s[2:3], s[4:5] +// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] + +s_mov_b64 s[2:3], 0xffffffffffffffff +// CHECK: s_mov_b64 s[2:3], -1 ; encoding: [0xc1,0x04,0x82,0xbe] + +s_cmov_b32 s1, 200 +// CHECK: s_cmov_b32 s1, 0xc8 ; encoding: [0xff,0x05,0x81,0xbe,0xc8,0x00,0x00,0x00] + +s_cmov_b32 s1, 1.0 +// CHECK: s_cmov_b32 s1, 1.0 ; encoding: [0xf2,0x05,0x81,0xbe] + +//s_cmov_b64 s[2:3], 1.0 +//CHECK-FIXME: s_cmov_b64 s[2:3], 1.0 ; encoding: [0xf2,0x05,0x82,0xb3] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +s_mov_b32 s1, s2 +// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] + +s_mov_b64 s[2:3], s[4:5] +// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] + +s_cmov_b32 s1, s2 +// CHECK: s_cmov_b32 s1, s2 ; encoding: [0x02,0x05,0x81,0xbe] + +s_cmov_b64 s[2:3], s[4:5] +// CHECK: s_cmov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x06,0x82,0xbe] + +s_not_b32 s1, s2 +// CHECK: s_not_b32 s1, s2 ; encoding: [0x02,0x07,0x81,0xbe] + +s_not_b64 s[2:3], s[4:5] +// CHECK: s_not_b64 s[2:3], s[4:5] ; encoding: [0x04,0x08,0x82,0xbe] + +s_wqm_b32 s1, s2 +// CHECK: s_wqm_b32 s1, s2 ; encoding: [0x02,0x09,0x81,0xbe] + +s_wqm_b64 s[2:3], s[4:5] +// CHECK: s_wqm_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0a,0x82,0xbe] + +s_brev_b32 s1, s2 +// CHECK: s_brev_b32 s1, s2 ; encoding: [0x02,0x0b,0x81,0xbe] + +s_brev_b64 s[2:3], s[4:5] +// CHECK: s_brev_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0c,0x82,0xbe] + +s_bcnt0_i32_b32 s1, s2 +// CHECK: s_bcnt0_i32_b32 s1, s2 ; encoding: [0x02,0x0d,0x81,0xbe] + +s_bcnt0_i32_b64 s1, s[2:3] +// CHECK: s_bcnt0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x0e,0x81,0xbe] + +s_bcnt1_i32_b32 s1, s2 +// CHECK: s_bcnt1_i32_b32 s1, s2 ; encoding: [0x02,0x0f,0x81,0xbe] + 
+s_bcnt1_i32_b64 s1, s[2:3] +// CHECK: s_bcnt1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x10,0x81,0xbe] + +s_ff0_i32_b32 s1, s2 +// CHECK: s_ff0_i32_b32 s1, s2 ; encoding: [0x02,0x11,0x81,0xbe] + +s_ff0_i32_b64 s1, s[2:3] +// CHECK: s_ff0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x12,0x81,0xbe] + +s_ff1_i32_b32 s1, s2 +// CHECK: s_ff1_i32_b32 s1, s2 ; encoding: [0x02,0x13,0x81,0xbe] + +s_ff1_i32_b64 s1, s[2:3] +// CHECK: s_ff1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x14,0x81,0xbe] + +s_flbit_i32_b32 s1, s2 +// CHECK: s_flbit_i32_b32 s1, s2 ; encoding: [0x02,0x15,0x81,0xbe] + +s_flbit_i32_b64 s1, s[2:3] +// CHECK: s_flbit_i32_b64 s1, s[2:3] ; encoding: [0x02,0x16,0x81,0xbe] + +s_flbit_i32 s1, s2 +// CHECK: s_flbit_i32 s1, s2 ; encoding: [0x02,0x17,0x81,0xbe] + +s_flbit_i32_i64 s1, s[2:3] +// CHECK: s_flbit_i32_i64 s1, s[2:3] ; encoding: [0x02,0x18,0x81,0xbe] + +s_sext_i32_i8 s1, s2 +// CHECK: s_sext_i32_i8 s1, s2 ; encoding: [0x02,0x19,0x81,0xbe] + +s_sext_i32_i16 s1, s2 +// CHECK: s_sext_i32_i16 s1, s2 ; encoding: [0x02,0x1a,0x81,0xbe] + +s_bitset0_b32 s1, s2 +// CHECK: s_bitset0_b32 s1, s2 ; encoding: [0x02,0x1b,0x81,0xbe] + +s_bitset0_b64 s[2:3], s[4:5] +// CHECK: s_bitset0_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1c,0x82,0xbe] + +s_bitset1_b32 s1, s2 +// CHECK: s_bitset1_b32 s1, s2 ; encoding: [0x02,0x1d,0x81,0xbe] + +s_bitset1_b64 s[2:3], s[4:5] +// CHECK: s_bitset1_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1e,0x82,0xbe] + +s_getpc_b64 s[2:3] +// CHECK: s_getpc_b64 s[2:3] ; encoding: [0x00,0x1f,0x82,0xbe] + +s_setpc_b64 s[2:3], s[4:5] +// CHECK: s_setpc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x20,0x82,0xbe] + +s_swappc_b64 s[2:3], s[4:5] +// CHECK: s_swappc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x21,0x82,0xbe] + +s_rfe_b64 s[2:3], s[4:5] +// CHECK: s_rfe_b64 s[2:3], s[4:5] ; encoding: [0x04,0x22,0x82,0xbe] + +s_and_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_and_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x24,0x82,0xbe] + +s_or_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_or_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x25,0x82,0xbe] + +s_xor_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_xor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x26,0x82,0xbe] + +s_andn2_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_andn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x27,0x82,0xbe] + +s_orn2_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_orn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x28,0x82,0xbe] + +s_nand_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_nand_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x29,0x82,0xbe] + +s_nor_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_nor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2a,0x82,0xbe] + +s_xnor_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_xnor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2b,0x82,0xbe] + +s_quadmask_b32 s1, s2 +// CHECK: s_quadmask_b32 s1, s2 ; encoding: [0x02,0x2c,0x81,0xbe] + +s_quadmask_b64 s[2:3], s[4:5] +// CHECK: s_quadmask_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2d,0x82,0xbe] + +s_movrels_b32 s1, s2 +// CHECK: s_movrels_b32 s1, s2 ; encoding: [0x02,0x2e,0x81,0xbe] + +s_movrels_b64 s[2:3], s[4:5] +// CHECK: s_movrels_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2f,0x82,0xbe] + +s_movreld_b32 s1, s2 +// CHECK: s_movreld_b32 s1, s2 ; encoding: [0x02,0x30,0x81,0xbe] + +s_movreld_b64 s[2:3], s[4:5] +// CHECK: s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x31,0x82,0xbe] + +s_cbranch_join s[4:5] +// CHECK: s_cbranch_join s[4:5] ; encoding: [0x04,0x32,0x80,0xbe] + +s_abs_i32 s1, s2 +// CHECK: s_abs_i32 s1, s2 ; encoding: [0x02,0x34,0x81,0xbe] + +s_mov_fed_b32 s1, s2 +// 
CHECK: s_mov_fed_b32 s1, s2 ; encoding: [0x02,0x35,0x81,0xbe] diff --git a/test/MC/AMDGPU/sop2.s b/test/MC/AMDGPU/sop2.s new file mode 100644 index 00000000000..9a7a1c01064 --- /dev/null +++ b/test/MC/AMDGPU/sop2.s @@ -0,0 +1,131 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +// CHECK: s_add_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x80] +s_add_u32 s1, s2, s3 + +// CHECK: s_sub_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x80] +s_sub_u32 s1, s2, s3 + +// CHECK: s_add_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x81] +s_add_i32 s1, s2, s3 + +// CHECK: s_sub_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x81] +s_sub_i32 s1, s2, s3 + +// CHECK: s_addc_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x82] +s_addc_u32 s1, s2, s3 + +// CHECK: s_subb_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x82] +s_subb_u32 s1, s2, s3 + +// CHECK: s_min_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x83] +s_min_i32 s1, s2, s3 + +// CHECK: s_min_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x83] +s_min_u32 s1, s2, s3 + +// CHECK: s_max_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x84] +s_max_i32 s1, s2, s3 + +// CHECK: s_max_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x84] +s_max_u32 s1, s2, s3 + +// CHECK: s_cselect_b32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x85] +s_cselect_b32 s1, s2, s3 + +// CHECK: s_cselect_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x85] +s_cselect_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_and_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x87] +s_and_b32 s2, s4, s6 + +// CHECK: s_and_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x87] +s_and_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_or_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x88] +s_or_b32 s2, s4, s6 + +// CHECK: s_or_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x88] +s_or_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_xor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x89] +s_xor_b32 s2, s4, s6 + +// CHECK: s_xor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x89] +s_xor_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_andn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8a] +s_andn2_b32 s2, s4, s6 + +// CHECK: s_andn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8a] +s_andn2_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_orn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8b] +s_orn2_b32 s2, s4, s6 + +// CHECK: s_orn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8b] +s_orn2_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_nand_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8c] +s_nand_b32 s2, s4, s6 + +// CHECK: s_nand_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8c] +s_nand_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_nor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8d] +s_nor_b32 s2, s4, s6 + +// CHECK: s_nor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8d] +s_nor_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_xnor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8e] +s_xnor_b32 s2, s4, s6 + +// CHECK: s_xnor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8e] +s_xnor_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_lshl_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8f] +s_lshl_b32 s2, s4, s6 + +// CHECK: s_lshl_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x8f] +s_lshl_b64 s[2:3], s[4:5], s6 + +// CHECK: s_lshr_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x90] +s_lshr_b32 s2, s4, s6 + +// CHECK: s_lshr_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x90] +s_lshr_b64 s[2:3], s[4:5], s6 + +// CHECK: 
s_ashr_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x91] +s_ashr_i32 s2, s4, s6 + +// CHECK: s_ashr_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x91] +s_ashr_i64 s[2:3], s[4:5], s6 + +// CHECK: s_bfm_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x92] +s_bfm_b32 s2, s4, s6 + +// CHECK: s_bfm_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x92] +s_bfm_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_mul_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x93] +s_mul_i32 s2, s4, s6 + +// CHECK: s_bfe_u32 s2, s4, s6 ; encoding: [0x04,0x06,0x82,0x93] +s_bfe_u32 s2, s4, s6 + +// CHECK: s_bfe_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x94] +s_bfe_i32 s2, s4, s6 + +// CHECK: s_bfe_u64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x94] +s_bfe_u64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_bfe_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x02,0x95] +s_bfe_i64 s[2:3], s[4:5], s6 + +// CHECK: s_cbranch_g_fork s[4:5], s[6:7] ; encoding: [0x04,0x06,0x80,0x95] +s_cbranch_g_fork s[4:5], s[6:7] + +// CHECK: s_absdiff_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x96] +s_absdiff_i32 s2, s4, s6 diff --git a/test/MC/AMDGPU/sopc.s b/test/MC/AMDGPU/sopc.s new file mode 100644 index 00000000000..0899c1a2eed --- /dev/null +++ b/test/MC/AMDGPU/sopc.s @@ -0,0 +1,9 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +s_cmp_eq_i32 s1, s2 +// CHECK: s_cmp_eq_i32 s1, s2 ; encoding: [0x01,0x02,0x00,0xbf] diff --git a/test/MC/AMDGPU/sopk.s b/test/MC/AMDGPU/sopk.s new file mode 100644 index 00000000000..6c27aaccb80 --- /dev/null +++ b/test/MC/AMDGPU/sopk.s @@ -0,0 +1,66 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +s_movk_i32 s2, 0x6 +// CHECK: s_movk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb0] + +s_cmovk_i32 s2, 0x6 +// CHECK: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] + +s_cmpk_eq_i32 s2, 0x6 +// CHECK: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] + +s_cmpk_lg_i32 s2, 0x6 +// CHECK: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] + +s_cmpk_gt_i32 s2, 0x6 +// CHECK: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] + +s_cmpk_ge_i32 s2, 0x6 +// CHECK: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] + +s_cmpk_lt_i32 s2, 0x6 +// CHECK: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] + +s_cmpk_le_i32 s2, 0x6 +// CHECK: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] + +s_cmpk_eq_u32 s2, 0x6 +// CHECK: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] + +s_cmpk_lg_u32 s2, 0x6 +// CHECK: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] + +s_cmpk_gt_u32 s2, 0x6 +// CHECK: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] + +s_cmpk_ge_u32 s2, 0x6 +// CHECK: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] + +s_cmpk_lt_u32 s2, 0x6 +// CHECK: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] + +s_cmpk_le_u32 s2, 0x6 +// CHECK: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] + +s_addk_i32 s2, 0x6 +// CHECK: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] + +s_mulk_i32 s2, 0x6 +// CHECK: s_mulk_i32 
s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb8] + +s_cbranch_i_fork s[2:3], 0x6 +// CHECK: s_cbranch_i_fork s[2:3], 0x6 ; encoding: [0x06,0x00,0x82,0xb8] + +s_getreg_b32 s2, 0x6 +// CHECK: s_getreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb9] + +s_setreg_b32 s2, 0x6 +// CHECK: s_setreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb9] + +s_setreg_imm32_b32 0xff, 0x6 +// CHECK: s_setreg_imm32_b32 0xff, 0x6 ; encoding: [0x06,0x00,0x80,0xba,0xff,0x00,0x00,0x00] diff --git a/test/MC/AMDGPU/sopp.s b/test/MC/AMDGPU/sopp.s new file mode 100644 index 00000000000..b072c16fdb2 --- /dev/null +++ b/test/MC/AMDGPU/sopp.s @@ -0,0 +1,64 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Edge Cases +//===----------------------------------------------------------------------===// + +s_nop 0 // CHECK: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +s_nop 0xffff // CHECK: s_nop 0xffff ; encoding: [0xff,0xff,0x80,0xbf] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + + s_nop 1 // CHECK: s_nop 1 ; encoding: [0x01,0x00,0x80,0xbf] + s_endpgm // CHECK: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] + s_branch 2 // CHECK: s_branch 2 ; encoding: [0x02,0x00,0x82,0xbf] + s_cbranch_scc0 3 // CHECK: s_cbranch_scc0 3 ; encoding: [0x03,0x00,0x84,0xbf] + s_cbranch_scc1 4 // CHECK: s_cbranch_scc1 4 ; encoding: [0x04,0x00,0x85,0xbf] + s_cbranch_vccz 5 // CHECK: s_cbranch_vccz 5 ; encoding: [0x05,0x00,0x86,0xbf] + s_cbranch_vccnz 6 // CHECK: s_cbranch_vccnz 6 ; encoding: [0x06,0x00,0x87,0xbf] + s_cbranch_execz 7 // CHECK: s_cbranch_execz 7 ; encoding: [0x07,0x00,0x88,0xbf] + s_cbranch_execnz 8 // CHECK: s_cbranch_execnz 8 ; encoding: [0x08,0x00,0x89,0xbf] + s_barrier // CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + + s_waitcnt 0 + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(1) + // CHECK: s_waitcnt vmcnt(1) ; encoding: [0x71,0x07,0x8c,0xbf] + + s_waitcnt expcnt(2) + // CHECK: s_waitcnt expcnt(2) ; encoding: [0x2f,0x07,0x8c,0xbf] + + s_waitcnt lgkmcnt(3) + // CHECK: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0x03,0x8c,0xbf] + + s_waitcnt vmcnt(0), expcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x07,0x8c,0xbf] + + + s_sethalt 9 // CHECK: s_sethalt 9 ; encoding: [0x09,0x00,0x8d,0xbf] + s_sleep 10 // CHECK: s_sleep 10 ; encoding: [0x0a,0x00,0x8e,0xbf] + s_setprio 1 // CHECK: s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf] + s_sendmsg 2 // CHECK: s_sendmsg Gs(nop), [m0] ; encoding: [0x02,0x00,0x90,0xbf] + s_sendmsghalt 3 // CHECK: s_sendmsghalt 3 ; encoding: [0x03,0x00,0x91,0xbf] + s_trap 4 // CHECK: s_trap 4 ; encoding: [0x04,0x00,0x92,0xbf] + s_icache_inv // CHECK: s_icache_inv ; encoding: 
[0x00,0x00,0x93,0xbf] + s_incperflevel 5 // CHECK: s_incperflevel 5 ; encoding: [0x05,0x00,0x94,0xbf] + s_decperflevel 6 // CHECK: s_decperflevel 6 ; encoding: [0x06,0x00,0x95,0xbf] + s_ttracedata // CHECK: s_ttracedata ; encoding: [0x00,0x00,0x96,0xbf] diff --git a/test/MC/AMDGPU/vop1.s b/test/MC/AMDGPU/vop1.s new file mode 100644 index 00000000000..d0b00fcd189 --- /dev/null +++ b/test/MC/AMDGPU/vop1.s @@ -0,0 +1,357 @@ +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI --check-prefix=CIVI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI + +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI + + +// GCN: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +v_nop + +// GCN: v_mov_b32_e32 v1, v2 ; encoding: [0x02,0x03,0x02,0x7e] +v_mov_b32 v1, v2 + +// GCN: v_readfirstlane_b32 s1, v2 ; encoding: [0x02,0x05,0x02,0x7e] +v_readfirstlane_b32 s1, v2 + +// GCN: v_cvt_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x07,0x02,0x7e] +v_cvt_i32_f64 v1, v[2:3] + +// GCN: v_cvt_f64_i32_e32 v[1:2], v2 ; encoding: [0x02,0x09,0x02,0x7e] +v_cvt_f64_i32 v[1:2], v2 + +// GCN: v_cvt_f32_i32_e32 v1, v2 ; encoding: [0x02,0x0b,0x02,0x7e] +v_cvt_f32_i32 v1, v2 + +// GCN: v_cvt_f32_u32_e32 v1, v2 ; encoding: [0x02,0x0d,0x02,0x7e] +v_cvt_f32_u32 v1, v2 + +// GCN: v_cvt_u32_f32_e32 v1, v2 ; encoding: [0x02,0x0f,0x02,0x7e +v_cvt_u32_f32 v1, v2 + +// GCN: v_cvt_i32_f32_e32 v1, v2 ; encoding: [0x02,0x11,0x02,0x7e] +v_cvt_i32_f32 v1, v2 + +// SICI: v_mov_fed_b32_e32 v1, v2 ; encoding: [0x02,0x13,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +v_mov_fed_b32 v1, v2 + +// GCN: v_cvt_f16_f32_e32 v1, v2 ; encoding: [0x02,0x15,0x02,0x7e] +v_cvt_f16_f32 v1, v2 + +// GCN: v_cvt_f32_f16_e32 v1, v2 ; encoding: [0x02,0x17,0x02,0x7e] +v_cvt_f32_f16 v1, v2 + +// GCN: v_cvt_rpi_i32_f32_e32 v1, v2 ; encoding: [0x02,0x19,0x02,0x7e] +v_cvt_rpi_i32_f32 v1, v2 + +// GCN: v_cvt_flr_i32_f32_e32 v1, v2 ; encoding: [0x02,0x1b,0x02,0x7e] +v_cvt_flr_i32_f32 v1, v2 + +// GCN: v_cvt_off_f32_i4_e32 v1, v2 ; encoding: [0x02,0x1d,0x02,0x7e] +v_cvt_off_f32_i4_e32 v1, v2 + +// GCN: v_cvt_f32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x1f,0x02,0x7e] +v_cvt_f32_f64 v1, v[2:3] + +// GCN: v_cvt_f64_f32_e32 v[1:2], v2 ; encoding: [0x02,0x21,0x02,0x7e] +v_cvt_f64_f32 v[1:2], v2 + +// GCN: v_cvt_f32_ubyte0_e32 v1, v2 ; encoding: [0x02,0x23,0x02,0x7e] +v_cvt_f32_ubyte0 v1, v2 + +// GCN: v_cvt_f32_ubyte1_e32 v1, v2 ; encoding: [0x02,0x25,0x02,0x7e] +v_cvt_f32_ubyte1_e32 v1, v2 + +// GCN: v_cvt_f32_ubyte2_e32 v1, v2 ; encoding: [0x02,0x27,0x02,0x7e] +v_cvt_f32_ubyte2 v1, v2 + +// GCN: v_cvt_f32_ubyte3_e32 v1, v2 ; encoding: [0x02,0x29,0x02,0x7e] +v_cvt_f32_ubyte3 v1, v2 + +// GCN: v_cvt_u32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x2b,0x02,0x7e] +v_cvt_u32_f64 v1, v[2:3] + +// 
GCN: v_cvt_f64_u32_e32 v[1:2], v2 ; encoding: [0x02,0x2d,0x02,0x7e] +v_cvt_f64_u32 v[1:2], v2 + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_trunc_f64_e32 v[1:2], v[2:3] +// CIVI: v_trunc_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x2f,0x02,0x7e] +v_trunc_f64_e32 v[1:2], v[2:3] + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_ceil_f64_e32 v[1:2], v[2:3] +// CIVI: v_ceil_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x31,0x02,0x7e] +v_ceil_f64_e32 v[1:2], v[2:3] + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_rndne_f64_e32 v[1:2], v[2:3] +// CIVI: v_rndne_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x33,0x02,0x7e] +v_rndne_f64_e32 v[1:2], v[2:3] + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_floor_f64_e32 v[1:2], v[2:3] +// CIVI: v_floor_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x35,0x02,0x7e] +v_floor_f64_e32 v[1:2], v[2:3] + +// SICI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] +// VI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x37,0x02,0x7e] +v_fract_f32 v1, v2 + +// SICI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] +// VI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x39,0x02,0x7e] +v_trunc_f32 v1, v2 + +// SICI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] +// VI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x3b,0x02,0x7e] +v_ceil_f32 v1, v2 + +// SICI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] +// VI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x3d,0x02,0x7e] +v_rndne_f32 v1, v2 + +// SICI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] +// VI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x3f,0x02,0x7e] +v_floor_f32_e32 v1, v2 + +// SICI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x4b,0x02,0x7e] +// VI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] +v_exp_f32 v1, v2 + +// SICI: v_log_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x4d,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_log_clamp_f32 v1, v2 +v_log_clamp_f32 v1, v2 + +// SICI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] +// VI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] +v_log_f32 v1, v2 + +// SICI: v_rcp_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x51,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rcp_clamp_f32 v1, v2 +v_rcp_clamp_f32 v1, v2 + +// SICI: v_rcp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rcp_legacy_f32 v1, v2 +v_rcp_legacy_f32 v1, v2 + +// SICI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] +// VI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] +v_rcp_f32 v1, v2 + +// SICI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] +// VI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] +v_rcp_iflag_f32 v1, v2 + +// SICI: v_rsq_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rsq_clamp_f32 v1, v2 +v_rsq_clamp_f32 v1, v2 + +// SICI: v_rsq_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rsq_legacy_f32 v1, v2 +v_rsq_legacy_f32 v1, v2 + +// SICI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] +// VI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] +v_rsq_f32_e32 v1, v2 + +// SICI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x5f,0x02,0x7e] +// VI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4b,0x02,0x7e] +v_rcp_f64 v[1:2], v[2:3] + +// 
SICI: v_rcp_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rcp_clamp_f64 v[1:2], v[2:3] +v_rcp_clamp_f64 v[1:2], v[2:3] + +// SICI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] +// VI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4d,0x02,0x7e] +v_rsq_f64 v[1:2], v[2:3] + +// SICI: v_rsq_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rsq_clamp_f64 v[1:2], v[2:3] +v_rsq_clamp_f64 v[1:2], v[2:3] + +// SICI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] +// VI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] +v_sqrt_f32 v1, v2 + +// SICI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x69,0x02,0x7e] +// VI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x51,0x02,0x7e] +v_sqrt_f64 v[1:2], v[2:3] + +// SICI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x6b,0x02,0x7e] +// VI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] +v_sin_f32 v1, v2 + +// SICI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] +// VI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] +v_cos_f32 v1, v2 + +// SICI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] +// VI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] +v_not_b32 v1, v2 + +// SICI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] +// VI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] +v_bfrev_b32 v1, v2 + +// SICI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] +// VI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] +v_ffbh_u32 v1, v2 + +// SICI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] +// VI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] +v_ffbl_b32 v1, v2 + +// SICI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] +// VI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x5f,0x02,0x7e] +v_ffbh_i32_e32 v1, v2 + +// SICI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x79,0x02,0x7e] +// VI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] +v_frexp_exp_i32_f64 v1, v[2:3] + +// SICI: v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7b,0x02,0x7e] +// VI; v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] +v_frexp_mant_f64 v[1:2], v[2:3] + +// SICI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7d,0x02,0x7e] +// VI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] +v_fract_f64 v[1:2], v[2:3] + +// SICI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] +// VI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] +v_frexp_exp_i32_f32 v1, v2 + +// SICI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] +// VI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x69,0x02,0x7e] +v_frexp_mant_f32 v1, v2 + +// SICI: v_clrexcp ; encoding: [0x00,0x82,0x00,0x7e] +// VI: v_clrexcp ; encoding: [0x00,0x6a,0x00,0x7e] +v_clrexcp + +// SICI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] +// VI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] +v_movreld_b32 v1, v2 + +// SICI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] +// VI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] +v_movrels_b32 v1, v2 + +// SICI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] +// VI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] +v_movrelsd_b32 v1, v2 + +// NOSI: error: instruction not supported on this GPU +// NOSI: 
v_log_legacy_f32 v1, v2 +// CI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] +// VI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x99,0x02,0x7e] +v_log_legacy_f32 v1, v2 + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_exp_legacy_f32 v1, v2 +// CI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] +// VI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x97,0x02,0x7e] +v_exp_legacy_f32 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_f16_u16 v1, v2 +// VI: v_cvt_f16_u16_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] +v_cvt_f16_u16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_f16_i16 v1, v2 +// VI: v_cvt_f16_i16_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] +v_cvt_f16_i16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_u16_f16 v1, v2 +// VI: v_cvt_u16_f16_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] +v_cvt_u16_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_i16_f16 v1, v2 +// VI: v_cvt_i16_f16_e32 v1, v2 ; encoding: [0x02,0x79,0x02,0x7e] +v_cvt_i16_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_rcp_f16 v1, v2 +// VI: v_rcp_f16_e32 v1, v2 ; encoding: [0x02,0x7b,0x02,0x7e] +v_rcp_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sqrt_f16 v1, v2 +// VI: v_sqrt_f16_e32 v1, v2 ; encoding: [0x02,0x7d,0x02,0x7e] +v_sqrt_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_rsq_f16 v1, v2 +// VI: v_rsq_f16_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] +v_rsq_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_log_f16 v1, v2 +// VI: v_log_f16_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] +v_log_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_exp_f16 v1, v2 +// VI: v_exp_f16_e32 v1, v2 ; encoding: [0x02,0x83,0x02,0x7e] +v_exp_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_frexp_mant_f16 v1, v2 +// VI: v_frexp_mant_f16_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] +v_frexp_mant_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_frexp_exp_i16_f16 v1, v2 +// VI: v_frexp_exp_i16_f16_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] +v_frexp_exp_i16_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_floor_f16 v1, v2 +// VI: v_floor_f16_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] +v_floor_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_ceil_f16 v1, v2 +// VI: v_ceil_f16_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] +v_ceil_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_trunc_f16 v1, v2 +// VI: v_trunc_f16_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] +v_trunc_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_rndne_f16 v1, v2 +// VI: v_rndne_f16_e32 v1, v2 ; encoding: [0x02,0x8f,0x02,0x7e] +v_rndne_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_fract_f16 v1, v2 +// VI: v_fract_f16_e32 v1, v2 ; encoding: [0x02,0x91,0x02,0x7e] +v_fract_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sin_f16 v1, v2 +// VI: v_sin_f16_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] +v_sin_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cos_f16 v1, v2 +// VI: 
v_cos_f16_e32 v1, v2 ; encoding: [0x02,0x95,0x02,0x7e] +v_cos_f16 v1, v2 diff --git a/test/MC/AMDGPU/vop2-err.s b/test/MC/AMDGPU/vop2-err.s new file mode 100644 index 00000000000..a1131000a90 --- /dev/null +++ b/test/MC/AMDGPU/vop2-err.s @@ -0,0 +1,35 @@ +// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s + +//===----------------------------------------------------------------------===// +// Generic checks +//===----------------------------------------------------------------------===// + +v_mul_i32_i24 v1, v2, 100 +// CHECK: error: invalid operand for instruction + +//===----------------------------------------------------------------------===// +// _e32 checks +//===----------------------------------------------------------------------===// + +// Immediate src1 +v_mul_i32_i24_e32 v1, v2, 100 +// CHECK: error: invalid operand for instruction + +// sgpr src1 +v_mul_i32_i24_e32 v1, v2, s3 +// CHECK: error: invalid operand for instruction + +//===----------------------------------------------------------------------===// +// _e64 checks +//===----------------------------------------------------------------------===// + +// Immediate src0 +v_mul_i32_i24_e64 v1, 100, v3 +// CHECK: error: invalid operand for instruction + +// Immediate src1 +v_mul_i32_i24_e64 v1, v2, 100 +// CHECK: error: invalid operand for instruction + +// TODO: Constant bus restrictions diff --git a/test/MC/AMDGPU/vop2.s b/test/MC/AMDGPU/vop2.s new file mode 100644 index 00000000000..a1f3b8d8936 --- /dev/null +++ b/test/MC/AMDGPU/vop2.s @@ -0,0 +1,421 @@ +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI + +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI + +//===----------------------------------------------------------------------===// +// Generic Checks for floating-point instructions (These have modifiers). 
+//===----------------------------------------------------------------------===// + +// TODO: 64-bit encoding of instructions with modifiers + +// _e32 suffix +// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] +v_add_f32_e32 v1, v2, v3 + +// src0 inline immediate +// SICI: v_add_f32_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x06] +v_add_f32 v1, 1.0, v3 + +// src0 negative inline immediate +// SICI: v_add_f32_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x06] +v_add_f32 v1, -1.0, v3 + +// src0 literal +// SICI: v_add_f32_e32 v1, 0x42c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0x42] +v_add_f32 v1, 100.0, v3 + +// src0 negative literal +// SICI: v_add_f32_e32 v1, 0xc2c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0xc2] +v_add_f32 v1, -100.0, v3 + +//===----------------------------------------------------------------------===// +// Generic Checks for integer instructions (These don't have modifiers). +//===----------------------------------------------------------------------===// + +// _e32 suffix +// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] +v_mul_i32_i24_e32 v1, v2, v3 + +// _e64 suffix +// SICI: v_mul_i32_i24_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x02,0x00] +v_mul_i32_i24_e64 v1, v2, v3 + +// src0 inline +// SICI: v_mul_i32_i24_e32 v1, 3, v3 ; encoding: [0x83,0x06,0x02,0x12] +v_mul_i32_i24 v1, 3, v3 + +// src0 negative inline +// SICI: v_mul_i32_i24_e32 v1, -3, v3 ; encoding: [0xc3,0x06,0x02,0x12] +v_mul_i32_i24 v1, -3, v3 + +// src1 inline +// SICI: v_mul_i32_i24_e64 v1, v2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x01,0x00] +v_mul_i32_i24 v1, v2, 3 + +// src1 negative inline +// SICI: v_mul_i32_i24_e64 v1, v2, -3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x87,0x01,0x00] +v_mul_i32_i24 v1, v2, -3 + +// src0 literal +// SICI: v_mul_i32_i24_e32 v1, 0x64, v3 ; encoding: [0xff,0x06,0x02,0x12,0x64,0x00,0x00,0x00] +v_mul_i32_i24 v1, 100, v3 + +// src1 negative literal +// SICI: v_mul_i32_i24_e32 v1, 0xffffff9c, v3 ; encoding: [0xff,0x06,0x02,0x12,0x9c,0xff,0xff,0xff] +v_mul_i32_i24 v1, -100, v3 + +//===----------------------------------------------------------------------===// +// Checks for legal operands +//===----------------------------------------------------------------------===// + +// src0 sgpr +// SICI: v_mul_i32_i24_e32 v1, s2, v3 ; encoding: [0x02,0x06,0x02,0x12] +v_mul_i32_i24 v1, s2, v3 + +// src1 sgpr +// SICI: v_mul_i32_i24_e64 v1, v2, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x00,0x00] +v_mul_i32_i24 v1, v2, s3 + +// src0, src1 same sgpr +// SICI: v_mul_i32_i24_e64 v1, s2, s2 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x04,0x00,0x00] +v_mul_i32_i24 v1, s2, s2 + +// src0 sgpr, src1 inline +// SICI: v_mul_i32_i24_e64 v1, s2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x06,0x01,0x00] +v_mul_i32_i24 v1, s2, 3 + +// src0 inline src1 sgpr +// SICI: v_mul_i32_i24_e64 v1, 3, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x83,0x06,0x00,0x00] +v_mul_i32_i24 v1, 3, s3 + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +// GCN: v_cndmask_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x00] +v_cndmask_b32 v1, v2, v3 + +// SICI: v_readlane_b32 s1, v2, s3 ; encoding: [0x02,0x07,0x02,0x02] +// VI: v_readlane_b32 s1, v2, s3 ; encoding: [0x01,0x00,0x89,0xd2,0x02,0x07,0x00,0x00] +v_readlane_b32 s1, v2, s3 + +// SICI: v_writelane_b32 v1, s2, s3 ; encoding: [0x02,0x06,0x02,0x04] +// VI: 
v_writelane_b32 v1, s2, s3 ; encoding: [0x01,0x00,0x8a,0xd2,0x02,0x06,0x00,0x00] +v_writelane_b32 v1, s2, s3 + +// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] +// VI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x02] +v_add_f32 v1, v2, v3 + +// SICI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] +// VI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x04] +v_sub_f32 v1, v2, v3 + +// SICI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] +// VI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] +v_subrev_f32 v1, v2, v3 + +// SICI: v_mac_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_mac_legacy_f32 v1, v2, v3 +v_mac_legacy_f32 v1, v2, v3 + +// SICI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] +// VI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] +v_mul_legacy_f32_e32 v1, v2, v3 + +// SICI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] +// VI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] +v_mul_f32 v1, v2, v3 + +// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] +// VI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] +v_mul_i32_i24 v1, v2, v3 + +// SICI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] +// VI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] +v_mul_hi_i32_i24 v1, v2, v3 + +// SICI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] +// VI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] +v_mul_u32_u24 v1, v2, v3 + +// SICI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] +// VI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] +v_mul_hi_u32_u24 v1, v2, v3 + +// SICI: v_min_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_min_legacy_f32_e32 v1, v2, v3 +v_min_legacy_f32_e32 v1, v2, v3 + +// SICI: v_max_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_max_legacy_f32 v1, v2, v3 +v_max_legacy_f32 v1, v2, v3 + +// SICI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] +// VI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] +v_min_f32_e32 v1, v2, v3 + +// SICI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] +// VI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] +v_max_f32 v1, v2 v3 + +// SICI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] +// VI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] +v_min_i32 v1, v2, v3 + +// SICI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] +// VI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] +v_max_i32 v1, v2, v3 + +// SICI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] +// VI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] +v_min_u32 v1, v2, v3 + +// SICI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] +// VI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] +v_max_u32 v1, v2, v3 + +// SICI: v_lshr_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_lshr_b32 v1, v2, v3 +v_lshr_b32 v1, v2, v3 + +// SICI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] +// VI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] 
+v_lshrrev_b32 v1, v2, v3 + +// SICI: v_ashr_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_ashr_i32 v1, v2, v3 +v_ashr_i32 v1, v2, v3 + +// SICI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x30] +// VI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] +v_ashrrev_i32 v1, v2, v3 + +// SICI: v_lshl_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_lshl_b32_e32 v1, v2, v3 +v_lshl_b32_e32 v1, v2, v3 + +// SICI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] +// VI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] +v_lshlrev_b32 v1, v2, v3 + +// SICI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] +// VI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] +v_and_b32 v1, v2, v3 + +// SICI: v_or_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x38] +// VI: v_or_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] +v_or_b32 v1, v2, v3 + +// SICI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] +// VI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] +v_xor_b32 v1, v2, v3 + +// SICI: v_bfm_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] +// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00] +v_bfm_b32 v1, v2, v3 + +// SICI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] +// VI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] +v_mac_f32 v1, v2, v3 + +// SICI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x40,0x00,0x00,0x80,0x42] +// VI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42] +v_madmk_f32 v1, v2, v3, 64.0 + +// SICI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x42,0x00,0x00,0x80,0x42] +// VI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42] +v_madak_f32 v1, v2, v3, 64.0 + +// SICI: v_bcnt_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] +// VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00] +v_bcnt_u32_b32 v1, v2, v3 + +// SICI: v_mbcnt_lo_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] +// VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00] +v_mbcnt_lo_u32_b32 v1, v2, v3 + +// SICI: v_mbcnt_hi_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x48] +// VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00] +v_mbcnt_hi_u32_b32 v1, v2, v3 + +// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] +// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] +v_add_i32 v1, v2, v3 + +// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] +// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] +v_add_u32 v1, v2, v3 + +// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] +// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] +v_sub_i32 v1, v2, v3 + +// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] +// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] +v_sub_u32 v1, v2, v3 + +// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] +// VI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] +v_subrev_i32 v1, v2, v3 + +// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] +// VI: v_subrev_i32_e32 v1, 
v2, v3 ; encoding: [0x02,0x07,0x02,0x36] +v_subrev_u32 v1, v2, v3 + +// SICI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] +// VI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x38] +v_addc_u32 v1, v2, v3 + +// SICI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] +// VI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] +v_subb_u32 v1, v2, v3 + +// SICI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] +// VI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] +v_subbrev_u32 v1, v2, v3 + +// SICI: v_ldexp_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] +// VI: v_ldexp_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00] +v_ldexp_f32 v1, v2, v3 + +// SICI: v_cvt_pkaccum_u8_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58] +// VI: v_cvt_pkaccum_u8_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00] +v_cvt_pkaccum_u8_f32 v1, v2, v3 + +// SICI: v_cvt_pknorm_i16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a] +// VI: v_cvt_pknorm_i16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pknorm_i16_f32 v1, v2, v3 + +// SICI: v_cvt_pknorm_u16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c] +// VI: v_cvt_pknorm_u16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pknorm_u16_f32 v1, v2, v3 + +// SICI: v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e] +// VI: v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pkrtz_f16_f32 v1, v2, v3 + +// SICI: v_cvt_pk_u16_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60] +// VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pk_u16_u32 v1, v2, v3 + +// SICI: v_cvt_pk_i16_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62] +// VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pk_i16_i32 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_add_f16 v1, v2, v3 +// VI: v_add_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] +v_add_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sub_f16 v1, v2, v3 +// VI: v_sub_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x40] +v_sub_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_subrev_f16 v1, v2, v3 +// VI: v_subrev_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x42] +v_subrev_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_mul_f16 v1, v2, v3 +// VI: v_mul_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] +v_mul_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_mac_f16 v1, v2, v3 +// VI: v_mac_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] +v_mac_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_madmk_f16 v1, v2, v3, 64.0 +// VI: v_madmk_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42] +v_madmk_f16 v1, v2, v3, 64.0 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_madak_f16 v1, v2, v3, 64.0 +// VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42] +v_madak_f16 v1, v2, v3, 64.0 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_add_u16 v1, v2, v3 +// VI: v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] +v_add_u16 v1, v2, 
v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sub_u16 v1, v2, v3 +// VI: v_sub_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] +v_sub_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_subrev_u16 v1, v2, v3 +// VI: v_subrev_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] +v_subrev_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_mul_lo_u16 v1, v2, v3 +// VI: v_mul_lo_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] +v_mul_lo_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_lshlrev_b16 v1, v2, v3 +// VI: v_lshlrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] +v_lshlrev_b16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_lshrrev_b16 v1, v2, v3 +// VI: v_lshrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] +v_lshrrev_b16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_ashrrev_b16 v1, v2, v3 +// VI: v_ashrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58] +v_ashrrev_b16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_max_f16 v1, v2, v3 +// VI: v_max_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a] +v_max_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_min_f16 v1, v2, v3 +// VI: v_min_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c] +v_min_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_max_u16 v1, v2, v3 +// VI: v_max_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e] +v_max_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_max_i16 v1, v2, v3 +// VI: v_max_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60] +v_max_i16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_min_u16 v1, v2, v3 +// VI: v_min_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62] +v_min_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_min_i16 v1, v2, v3 +// VI: v_min_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x64] +v_min_i16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_ldexp_f16 v1, v2, v3 +// VI: v_ldexp_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x66] +v_ldexp_f16 v1, v2, v3 diff --git a/test/MC/AMDGPU/vop3-errs.s b/test/MC/AMDGPU/vop3-errs.s new file mode 100644 index 00000000000..b57fe6d5314 --- /dev/null +++ b/test/MC/AMDGPU/vop3-errs.s @@ -0,0 +1,5 @@ +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s + +v_add_f32_e64 v0, v1 +// CHECK: error: too few operands for instruction diff --git a/test/MC/AMDGPU/vop3.s b/test/MC/AMDGPU/vop3.s new file mode 100644 index 00000000000..20562335974 --- /dev/null +++ b/test/MC/AMDGPU/vop3.s @@ -0,0 +1,149 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + +// Test forced e64 encoding + +v_cmp_lt_f32_e64 s[2:3], v4, -v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40] + +// +// Modifier tests: +// + +v_cmp_lt_f32 s[2:3] -v4, v6 +// 
CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x20] + +v_cmp_lt_f32 s[2:3] v4, -v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40] + +v_cmp_lt_f32 s[2:3] -v4, -v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x60] + +v_cmp_lt_f32 s[2:3] |v4|, v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3] v4, |v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, |v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3] |v4|, |v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, |v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3] -|v4|, v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x20] + +v_cmp_lt_f32 s[2:3] v4, -|v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -|v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x40] + +v_cmp_lt_f32 s[2:3] -|v4|, -|v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, -|v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x60] + +// +// Instruction tests: +// + +v_cmp_f_f32 s[2:3], v4, v6 +// CHECK: v_cmp_f_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x00,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3], v4, v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_eq_f32 s[2:3], v4, v6 +// CHECK: v_cmp_eq_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x04,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_le_f32 s[2:3], v4, v6 +// CHECK: v_cmp_le_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x06,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_gt_f32 s[2:3], v4, v6 +// CHECK: v_cmp_gt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x08,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lg_f32 s[2:3], v4, v6 +// CHECK: v_cmp_lg_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0a,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_ge_f32 s[2:3], v4, v6 +// CHECK: v_cmp_ge_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0c,0xd0,0x04,0x0d,0x02,0x00] + +// TODO: Finish VOPC + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +// +// Modifier tests: +// + +v_fract_f32 v1, -v2 +// CHECK: v_fract_f32_e64 v1, -v2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x20] + +v_fract_f32 v1, |v2| +// CHECK: v_fract_f32_e64 v1, |v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x00] + +v_fract_f32 v1, -|v2| +// CHECK: v_fract_f32_e64 v1, -|v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x20] + +v_fract_f32 v1, v2 clamp +// CHECK: v_fract_f32_e64 v1, v2 clamp ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x00] + +v_fract_f32 v1, v2 mul:2 +// CHECK: v_fract_f32_e64 v1, v2 mul:2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x08] + +v_fract_f32 v1, v2, div:2 clamp +// CHECK: v_fract_f32_e64 v1, v2 clamp div:2 ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x18] + +// TODO: Finish VOP1 + +///===---------------------------------------------------------------------===// +// VOP2 Instructions +///===---------------------------------------------------------------------===// + +// Test forced e64 encoding with e32 operands + +v_ldexp_f32_e64 v1, v3, v5 +// CHECK: v_ldexp_f32_e64 v1, v3, v5 ; encoding: [0x01,0x00,0x56,0xd2,0x03,0x0b,0x02,0x00] + + +// TODO: Modifier tests + +v_cndmask_b32 v1, v3, v5, s[4:5] +// CHECK: v_cndmask_b32_e64 v1, v3, v5, s[4:5] ; encoding: 
[0x01,0x00,0x00,0xd2,0x03,0x0b,0x12,0x00] + +//TODO: readlane, writelane + +v_add_f32 v1, v3, s5 +// CHECK: v_add_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x06,0xd2,0x03,0x0b,0x00,0x00] + +v_sub_f32 v1, v3, s5 +// CHECK: v_sub_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x08,0xd2,0x03,0x0b,0x00,0x00] + +v_subrev_f32 v1, v3, s5 +// CHECK: v_subrev_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0a,0xd2,0x03,0x0b,0x00,0x00] + +v_mac_legacy_f32 v1, v3, s5 +// CHECK: v_mac_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0c,0xd2,0x03,0x0b,0x00,0x00] + +v_mul_legacy_f32 v1, v3, s5 +// CHECK: v_mul_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0e,0xd2,0x03,0x0b,0x00,0x00] + +v_mul_f32 v1, v3, s5 +// CHECK: v_mul_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x10,0xd2,0x03,0x0b,0x00,0x00] + +v_mul_i32_i24 v1, v3, s5 +// CHECK: v_mul_i32_i24_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x12,0xd2,0x03,0x0b,0x00,0x00] + +///===---------------------------------------------------------------------===// +// VOP3 Instructions +///===---------------------------------------------------------------------===// + +// TODO: Modifier tests + +v_mad_legacy_f32 v2, v4, v6, v8 +// CHECK: v_mad_legacy_f32 v2, v4, v6, v8 ; encoding: [0x02,0x00,0x80,0xd2,0x04,0x0d,0x22,0x04] + + + + + diff --git a/test/MC/AMDGPU/vopc.s b/test/MC/AMDGPU/vopc.s new file mode 100644 index 00000000000..f44919a4f1e --- /dev/null +++ b/test/MC/AMDGPU/vopc.s @@ -0,0 +1,40 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Generic Checks +//===----------------------------------------------------------------------===// + +// src0 sgpr +v_cmp_lt_f32 vcc, s2, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x02,0x7c] + +// src0 inline immediate +v_cmp_lt_f32 vcc, 0, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x02,0x7c] + +// src0 literal +v_cmp_lt_f32 vcc, 10.0, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x02,0x7c,0x00,0x00,0x20,0x41] + +// src0, src1 max vgpr +v_cmp_lt_f32 vcc, v255, v255 +// CHECK: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x03,0x7c] + +// force 32-bit encoding +v_cmp_lt_f32_e32 vcc, v2, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] + + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +v_cmp_f_f32 vcc, v2, v4 +// CHECK: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x00,0x7c] + +v_cmp_lt_f32 vcc, v2, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] + +// TODO: Add tests for the rest of the instructions. + diff --git a/test/MC/R600/ds-err.s b/test/MC/R600/ds-err.s deleted file mode 100644 index 52c2740bec2..00000000000 --- a/test/MC/R600/ds-err.s +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s - -// offset too big -// CHECK: invalid operand for instruction -ds_add_u32 v2, v4 offset:1000000000 - -// offset0 twice -// CHECK: error: not a valid operand. -ds_write2_b32 v2, v4, v6 offset0:4 offset0:8 - -// offset1 twice -// CHECK: error: not a valid operand. 
-ds_write2_b32 v2, v4, v6 offset1:4 offset1:8 - -// offset0 too big -// CHECK: invalid operand for instruction -ds_write2_b32 v2, v4, v6 offset0:1000000000 - -// offset1 too big -// CHECK: invalid operand for instruction -ds_write2_b32 v2, v4, v6 offset1:1000000000 - diff --git a/test/MC/R600/ds.s b/test/MC/R600/ds.s deleted file mode 100644 index ad63229ba2e..00000000000 --- a/test/MC/R600/ds.s +++ /dev/null @@ -1,337 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Checks for 16-bit Offsets -//===----------------------------------------------------------------------===// - -ds_add_u32 v2, v4 offset:16 -// CHECK: ds_add_u32 v2, v4 offset:16 ; encoding: [0x10,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] - -//===----------------------------------------------------------------------===// -// Checks for 2 8-bit Offsets -//===----------------------------------------------------------------------===// - -ds_write2_b32 v2, v4, v6 offset0:4 -// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 ; encoding: [0x04,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 -// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 ; encoding: [0x04,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_write2_b32 v2, v4, v6 offset1:8 -// CHECK: ds_write2_b32 v2, v4, v6 offset1:8 ; encoding: [0x00,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_read2_b32 v[8:9], v2 offset0:4 -// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 ; encoding: [0x04,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] - -ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 -// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 ; encoding: [0x04,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] - -ds_read2_b32 v[8:9], v2 offset1:8 -// CHECK: ds_read2_b32 v[8:9], v2 offset1:8 ; encoding: [0x00,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -ds_add_u32 v2, v4 -// CHECK: ds_add_u32 v2, v4 ; encoding: [0x00,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] - -ds_sub_u32 v2, v4 -// CHECK: ds_sub_u32 v2, v4 ; encoding: [0x00,0x00,0x04,0xd8,0x02,0x04,0x00,0x00] - -ds_rsub_u32 v2, v4 -// CHECK: ds_rsub_u32 v2, v4 ; encoding: [0x00,0x00,0x08,0xd8,0x02,0x04,0x00,0x00] - -ds_inc_u32 v2, v4 -// CHECK: ds_inc_u32 v2, v4 ; encoding: [0x00,0x00,0x0c,0xd8,0x02,0x04,0x00,0x00] - -ds_dec_u32 v2, v4 -// CHECK: ds_dec_u32 v2, v4 ; encoding: [0x00,0x00,0x10,0xd8,0x02,0x04,0x00,0x00] - -ds_min_i32 v2, v4 -// CHECK: ds_min_i32 v2, v4 ; encoding: [0x00,0x00,0x14,0xd8,0x02,0x04,0x00,0x00] - -ds_max_i32 v2, v4 -// CHECK: ds_max_i32 v2, v4 ; encoding: [0x00,0x00,0x18,0xd8,0x02,0x04,0x00,0x00] - -ds_min_u32 v2, v4 -// CHECK: ds_min_u32 v2, v4 ; encoding: [0x00,0x00,0x1c,0xd8,0x02,0x04,0x00,0x00] - -ds_max_u32 v2, v4 -// CHECK: ds_max_u32 v2, v4 ; encoding: [0x00,0x00,0x20,0xd8,0x02,0x04,0x00,0x00] - -ds_and_b32 v2, v4 -// CHECK: ds_and_b32 v2, v4 ; encoding: [0x00,0x00,0x24,0xd8,0x02,0x04,0x00,0x00] - -ds_or_b32 v2, v4 -// CHECK: ds_or_b32 v2, v4 ; encoding: [0x00,0x00,0x28,0xd8,0x02,0x04,0x00,0x00] - -ds_xor_b32 v2, v4 -// CHECK: ds_xor_b32 v2, v4 ; encoding: [0x00,0x00,0x2c,0xd8,0x02,0x04,0x00,0x00] - -ds_mskor_b32 v2, v4, v6 -// CHECK: ds_mskor_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x04,0x06,0x00] - -ds_write_b32 v2, v4 -// CHECK: ds_write_b32 v2, v4 ; 
encoding: [0x00,0x00,0x34,0xd8,0x02,0x04,0x00,0x00] - -ds_write2_b32 v2, v4, v6 -// CHECK: ds_write2_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_write2st64_b32 v2, v4, v6 -// CHECK: ds_write2st64_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x3c,0xd8,0x02,0x04,0x06,0x00] - -ds_cmpst_b32 v2, v4, v6 -// CHECK: ds_cmpst_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x40,0xd8,0x02,0x04,0x06,0x00] - -ds_cmpst_f32 v2, v4, v6 -// CHECK: ds_cmpst_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x44,0xd8,0x02,0x04,0x06,0x00] - -ds_min_f32 v2, v4, v6 -// CHECK: ds_min_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x48,0xd8,0x02,0x04,0x06,0x00] - -ds_max_f32 v2, v4, v6 -// CHECK: ds_max_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x4c,0xd8,0x02,0x04,0x06,0x00] - -ds_gws_init v2 gds -// CHECK: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_sema_v v2 gds -// CHECK: ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x6a,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_sema_br v2 gds -// CHECK: ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x6e,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_sema_p v2 gds -// CHECK: ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x72,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_barrier v2 gds -// CHECK: ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x76,0xd8,0x02,0x00,0x00,0x00] - -ds_write_b8 v2, v4 -// CHECK: ds_write_b8 v2, v4 ; encoding: [0x00,0x00,0x78,0xd8,0x02,0x04,0x00,0x00] - -ds_write_b16 v2, v4 -// CHECK: ds_write_b16 v2, v4 ; encoding: [0x00,0x00,0x7c,0xd8,0x02,0x04,0x00,0x00] - -ds_add_rtn_u32 v8, v2, v4 -// CHECK: ds_add_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x80,0xd8,0x02,0x04,0x00,0x08] - -ds_sub_rtn_u32 v8, v2, v4 -// CHECK: ds_sub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x84,0xd8,0x02,0x04,0x00,0x08] - -ds_rsub_rtn_u32 v8, v2, v4 -// CHECK: ds_rsub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x88,0xd8,0x02,0x04,0x00,0x08] - -ds_inc_rtn_u32 v8, v2, v4 -// CHECK: ds_inc_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x8c,0xd8,0x02,0x04,0x00,0x08] - -ds_dec_rtn_u32 v8, v2, v4 -// CHECK: ds_dec_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x90,0xd8,0x02,0x04,0x00,0x08] - -ds_min_rtn_i32 v8, v2, v4 -// CHECK: ds_min_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x94,0xd8,0x02,0x04,0x00,0x08] - -ds_max_rtn_i32 v8, v2, v4 -// CHECK: ds_max_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x98,0xd8,0x02,0x04,0x00,0x08] - -ds_min_rtn_u32 v8, v2, v4 -// CHECK: ds_min_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x9c,0xd8,0x02,0x04,0x00,0x08] - -ds_max_rtn_u32 v8, v2, v4 -// CHECK: ds_max_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0xa0,0xd8,0x02,0x04,0x00,0x08] - -ds_and_rtn_b32 v8, v2, v4 -// CHECK: ds_and_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa4,0xd8,0x02,0x04,0x00,0x08] - -ds_or_rtn_b32 v8, v2, v4 -// CHECK: ds_or_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa8,0xd8,0x02,0x04,0x00,0x08] - -ds_xor_rtn_b32 v8, v2, v4 -// CHECK: ds_xor_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xac,0xd8,0x02,0x04,0x00,0x08] - -ds_mskor_rtn_b32 v8, v2, v4, v6 -// CHECK: ds_mskor_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xb0,0xd8,0x02,0x04,0x06,0x08] - -ds_wrxchg_rtn_b32 v8, v2, v4 -// CHECK: ds_wrxchg_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xb4,0xd8,0x02,0x04,0x00,0x08] - -ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 -// CHECK: ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xb8,0xd8,0x02,0x04,0x06,0x08] - -ds_wrxchg2st64_rtn_b32 v[8:9] v2, v4, v6 -// CHECK: ds_wrxchg2st64_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xbc,0xd8,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_b32 v8, v2, v4, v6 -// CHECK: 
ds_cmpst_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc0,0xd8,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_f32 v8, v2, v4, v6 -// CHECK: ds_cmpst_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc4,0xd8,0x02,0x04,0x06,0x08] - -ds_min_rtn_f32 v8, v2, v4, v6 -// CHECK: ds_min_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc8,0xd8,0x02,0x04,0x06,0x08] - -ds_max_rtn_f32 v8, v2, v4, v6 -// CHECK: ds_max_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x06,0x08] - -ds_swizzle_b32 v8, v2 -// CHECK: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] - -ds_read_b32 v8, v2 -// CHECK: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08] - -ds_read2_b32 v[8:9], v2 -// CHECK: ds_read2_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] - -ds_read2st64_b32 v[8:9], v2 -// CHECK: ds_read2st64_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xe0,0xd8,0x02,0x00,0x00,0x08] - -ds_read_i8 v8, v2 -// CHECK: ds_read_i8 v8, v2 ; encoding: [0x00,0x00,0xe4,0xd8,0x02,0x00,0x00,0x08] - -ds_read_u8 v8, v2 -// CHECK: ds_read_u8 v8, v2 ; encoding: [0x00,0x00,0xe8,0xd8,0x02,0x00,0x00,0x08] - -ds_read_i16 v8, v2 -// CHECK: ds_read_i16 v8, v2 ; encoding: [0x00,0x00,0xec,0xd8,0x02,0x00,0x00,0x08] - -ds_read_u16 v8, v2 -// CHECK: ds_read_u16 v8, v2 ; encoding: [0x00,0x00,0xf0,0xd8,0x02,0x00,0x00,0x08] - -ds_consume v8 -// CHECK: ds_consume v8 ; encoding: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x08] - -ds_append v8 -// CHECK: ds_append v8 ; encoding: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x08] - -ds_ordered_count v8, v2 gds -// CHECK: ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0xfe,0xd8,0x02,0x00,0x00,0x08] - -ds_add_u64 v2, v[4:5] -// CHECK: ds_add_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x00,0xd9,0x02,0x04,0x00,0x00] - -ds_sub_u64 v2, v[4:5] -// CHECK: ds_sub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x04,0xd9,0x02,0x04,0x00,0x00] - -ds_rsub_u64 v2, v[4:5] -// CHECK: ds_rsub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x08,0xd9,0x02,0x04,0x00,0x00] - -ds_inc_u64 v2, v[4:5] -// CHECK: ds_inc_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x0c,0xd9,0x02,0x04,0x00,0x00] - -ds_dec_u64 v2, v[4:5] -// CHECK: ds_dec_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x10,0xd9,0x02,0x04,0x00,0x00] - -ds_min_i64 v2, v[4:5] -// CHECK: ds_min_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x14,0xd9,0x02,0x04,0x00,0x00] - -ds_max_i64 v2, v[4:5] -// CHECK: ds_max_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x18,0xd9,0x02,0x04,0x00,0x00] - -ds_min_u64 v2, v[4:5] -// CHECK: ds_min_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x1c,0xd9,0x02,0x04,0x00,0x00] - -ds_max_u64 v2, v[4:5] -// CHECK: ds_max_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x20,0xd9,0x02,0x04,0x00,0x00] - -ds_and_b64 v2, v[4:5] -// CHECK: ds_and_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x24,0xd9,0x02,0x04,0x00,0x00] - -ds_or_b64 v2, v[4:5] -// CHECK: ds_or_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x28,0xd9,0x02,0x04,0x00,0x00] - -ds_xor_b64 v2, v[4:5] -// CHECK: ds_xor_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x2c,0xd9,0x02,0x04,0x00,0x00] - -ds_mskor_b64 v2, v[4:5], v[6:7] -// CHECK: ds_mskor_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x30,0xd9,0x02,0x04,0x06,0x00] - -ds_write_b64 v2, v[4:5] -// CHECK: ds_write_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x34,0xd9,0x02,0x04,0x00,0x00] - -ds_write2_b64 v2, v[4:5], v[6:7] -// CHECK: ds_write2_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x38,0xd9,0x02,0x04,0x06,0x00] - -ds_write2st64_b64 v2, v[4:5], v[6:7] -// CHECK: ds_write2st64_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x3c,0xd9,0x02,0x04,0x06,0x00] - -ds_cmpst_b64 v2, 
v[4:5], v[6:7] -// CHECK: ds_cmpst_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x40,0xd9,0x02,0x04,0x06,0x00] - -ds_cmpst_f64 v2, v[4:5], v[6:7] -// CHECK: ds_cmpst_f64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x44,0xd9,0x02,0x04,0x06,0x00] - -ds_min_f64 v2, v[4:5] -// CHECK: ds_min_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x48,0xd9,0x02,0x04,0x00,0x00] - -ds_max_f64 v2, v[4:5] -// CHECK: ds_max_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x4c,0xd9,0x02,0x04,0x00,0x00] - -ds_add_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_add_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x80,0xd9,0x02,0x04,0x00,0x08] - -ds_sub_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_sub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x84,0xd9,0x02,0x04,0x00,0x08] - -ds_rsub_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_rsub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x88,0xd9,0x02,0x04,0x00,0x08] - -ds_inc_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_inc_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x8c,0xd9,0x02,0x04,0x00,0x08] - -ds_dec_rtn_u64 v[8:9] v2, v[4:5] -// CHECK: ds_dec_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x90,0xd9,0x02,0x04,0x00,0x08] - -ds_min_rtn_i64 v[8:9], v2, v[4:5] -// CHECK: ds_min_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x94,0xd9,0x02,0x04,0x00,0x08] - -ds_max_rtn_i64 v[8:9], v2, v[4:5] -// CHECK: ds_max_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x98,0xd9,0x02,0x04,0x00,0x08] - -ds_min_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_min_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x9c,0xd9,0x02,0x04,0x00,0x08] - -ds_max_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_max_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa0,0xd9,0x02,0x04,0x00,0x08] - -ds_and_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_and_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa4,0xd9,0x02,0x04,0x00,0x08] - -ds_or_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_or_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa8,0xd9,0x02,0x04,0x00,0x08] - -ds_xor_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_xor_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xac,0xd9,0x02,0x04,0x00,0x08] - -ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] -// CHECK: ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb0,0xd9,0x02,0x04,0x06,0x08] - -ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xb4,0xd9,0x02,0x04,0x00,0x08] - -ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] -// CHECK: ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb8,0xd9,0x02,0x04,0x06,0x08] - -ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] -// CHECK: ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xbc,0xd9,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] -// CHECK: ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc0,0xd9,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] -// CHECK: ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc4,0xd9,0x02,0x04,0x06,0x08] - -ds_min_rtn_f64 v[8:9], v2, v[4:5] -// CHECK: ds_min_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xc8,0xd9,0x02,0x04,0x00,0x08] - -ds_max_rtn_f64 v[8:9], v2, v[4:5] -// CHECK: ds_max_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xcc,0xd9,0x02,0x04,0x00,0x08] - -ds_read_b64 v[8:9], v2 -// CHECK: ds_read_b64 v[8:9], v2 ; encoding: [0x00,0x00,0xd8,0xd9,0x02,0x00,0x00,0x08] - -ds_read2_b64 v[8:11], v2 -// CHECK: ds_read2_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xdc,0xd9,0x02,0x00,0x00,0x08] - 
-ds_read2st64_b64 v[8:11], v2 -// CHECK: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xe0,0xd9,0x02,0x00,0x00,0x08] diff --git a/test/MC/R600/flat.s b/test/MC/R600/flat.s deleted file mode 100644 index adad29a5595..00000000000 --- a/test/MC/R600/flat.s +++ /dev/null @@ -1,477 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=CIVI --check-prefix=CI -// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=CIVI - -// FIXME: These instructions give an 'invalid operand' error on SI and should -// instead be reporting an 'instruction not supported' error. - -// XUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=NOVI -// XUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI -// XUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI - -//===----------------------------------------------------------------------===// -// Operands -//===----------------------------------------------------------------------===// - -flat_load_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] glc slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] glc tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] slc glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] slc tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc tfe 
; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_store_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] glc slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] glc tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] slc glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] slc tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: 
[0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -// FIXME: For atomic instructions, glc must be placed immediately following -// the data regiser. These forms aren't currently supported: -// flat_atomic_add v1, v[3:4], v5 slc glc -// flat_atomic_add v1, v[3:4], v5 slc glc tfe -// flat_atomic_add v1, v[3:4], v5 slc tfe glc -// flat_atomic_add v1, v[3:4], v5 tfe glc -// flat_atomic_add v[3:4], v5 tfe glc -// flat_atomic_add v1, v[3:4], v5 tfe glc slc -// flat_atomic_add v1, v[3:4], v5 tfe slc glc - -flat_atomic_add v1 v[3:4], v5 glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_add v1 v[3:4], v5 glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x80,0x01] - -flat_atomic_add v1 v[3:4], v5 glc slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] - -flat_atomic_add v1 v[3:4], v5 glc tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] - -flat_atomic_add v[3:4], v5 slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_add v[3:4], v5 slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x80,0x00] - -flat_atomic_add v[3:4], v5 tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x80,0x00] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -flat_load_ubyte v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_ubyte v1, v[3:4] ; encoding: [0x00,0x00,0x20,0xdc,0x03,0x00,0x00,0x01] - -flat_load_sbyte v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_sbyte v1, v[3:4] ; encoding: [0x00,0x00,0x24,0xdc,0x03,0x00,0x00,0x01] - -flat_load_ushort v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_ushort v1, v[3:4] ; encoding: [0x00,0x00,0x28,0xdc,0x03,0x00,0x00,0x01] - -flat_load_sshort v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_sshort v1, v[3:4] ; encoding: [0x00,0x00,0x2c,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dwordx2 v[1:2], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x34,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dwordx4 v[5:8], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x38,0xdc,0x03,0x00,0x00,0x05] - -flat_load_dwordx3 v[5:7], v[3:4] -// 
NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x3c,0xdc,0x03,0x00,0x00,0x05] - -flat_store_byte v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_byte v1, v[3:4] ; encoding: [0x00,0x00,0x60,0xdc,0x03,0x01,0x00,0x00] - -flat_store_short v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_short v1, v[3:4] ; encoding: [0x00,0x00,0x68,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dwordx2 v[1:2], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x74,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dwordx4 v[5:8], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x78,0xdc,0x03,0x05,0x00,0x00] - -flat_store_dwordx3 v[5:7], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x7c,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_swap v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap v[3:4], v5 ; encoding: [0x00,0x00,0xc0,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_swap v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc1,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_cmpswap v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap v[3:4], v[5:6] ; encoding: [0x00,0x00,0xc4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_cmpswap v1, v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap v1, v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0xc5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_add v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_add v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_sub v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub v[3:4], v5 ; encoding: [0x00,0x00,0xcc,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_sub v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xcd,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_smin v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin v[3:4], v5 ; encoding: [0x00,0x00,0xd4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_smin v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_umin v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin v[3:4], v5 ; encoding: [0x00,0x00,0xd8,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_umin v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd9,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_smax v[3:4], v5 -// NOSI: error: 
instruction not supported on this GPU -// CIVI: flat_atomic_smax v[3:4], v5 ; encoding: [0x00,0x00,0xdc,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_smax v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xdd,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_umax v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax v[3:4], v5 ; encoding: [0x00,0x00,0xe0,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_umax v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe1,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_and v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and v[3:4], v5 ; encoding: [0x00,0x00,0xe4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_and v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_or v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or v[3:4], v5 ; encoding: [0x00,0x00,0xe8,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_or v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe9,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_xor v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor v[3:4], v5 ; encoding: [0x00,0x00,0xec,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_xor v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xed,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_inc v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_inc v[3:4], v5 ; encoding: [0x00,0x00,0xf0,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_inc v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_inc v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf1,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_dec v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec v[3:4], v5 ; encoding: [0x00,0x00,0xf4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_dec v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_swap_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x40,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_swap_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x41,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_cmpswap_x2 v[3:4], v[5:8] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x44,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x45,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_add_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x48,0xdd,0x03,0x05,0x00,0x00] - 
-flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x49,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_sub_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x4c,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x4d,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_smin_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x54,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x55,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_umin_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x58,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x59,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_smax_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x5c,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x5d,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_umax_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x60,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x61,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_and_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x64,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x65,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_or_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x68,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_or_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x69,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_xor_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x6c,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x6d,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_inc_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// 
CIVI: flat_atomic_inc_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x70,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x71,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_dec_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x74,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x75,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_fcmpswap_x2 v[3:4], v[5:8] -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fcmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x78,0xdd,0x03,0x05,0x00,0x00] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x79,0xdd,0x03,0x05,0x00,0x01] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmin_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x7c,0xdd,0x03,0x05,0x00,0x00] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x7d,0xdd,0x03,0x05,0x00,0x01] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmax_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x80,0xdd,0x03,0x05,0x00,0x00] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x81,0xdd,0x03,0x05,0x00,0x01] -// NOVI: error: instruction not supported on this GPU diff --git a/test/MC/R600/lit.local.cfg b/test/MC/R600/lit.local.cfg deleted file mode 100644 index ad9ce2541ef..00000000000 --- a/test/MC/R600/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'R600' in config.root.targets: - config.unsupported = True diff --git a/test/MC/R600/mubuf.s b/test/MC/R600/mubuf.s deleted file mode 100644 index 78d365abef1..00000000000 --- a/test/MC/R600/mubuf.s +++ /dev/null @@ -1,352 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Test for different operand combinations -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// load - immediate offset only -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, s[4:7], s1 -// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, 
s[4:7], s1 offset:4 glc -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 slc -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x41,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 tfe -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x81,0x01] - -buffer_load_dword v1, s[4:7], s1 tfe glc -// CHECK: buffer_load_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x30,0xe0,0x00,0x01,0x81,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] - -buffer_load_dword v1, s[4:7], s1 glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - vgpr offset -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v2, s[4:7], s1 offen -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen tfe glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - vgpr index -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v2, s[4:7], s1 idxen -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 tfe -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 
idxen offset:4 tfe ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen tfe glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - vgpr index and offset -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - addr64 -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], 
s[4:7], s1 addr64 tfe glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - immediate offset only -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, s[4:7], s1 -// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 glc -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 slc -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x41,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 tfe -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x81,0x01] - -buffer_store_dword v1, s[4:7], s1 tfe glc -// CHECK: buffer_store_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x70,0xe0,0x00,0x01,0x81,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] - -buffer_store_dword v1, s[4:7], s1 glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - vgpr offset -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v2, s[4:7], s1 offen -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen tfe glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 -// 
CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - vgpr index -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v2, s[4:7], s1 idxen -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen tfe glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - vgpr index and offset -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: 
[0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - addr64 -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 tfe glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -buffer_load_format_x v1, s[4:7], s1 -// CHECK: buffer_load_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_format_xy v[1:2], s[4:7], s1 -// CHECK: buffer_load_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_format_xyz v[1:3], s[4:7], s1 -// CHECK: buffer_load_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_format_xyzw v[1:4], s[4:7], s1 -// CHECK: buffer_load_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_x v1, s[4:7], s1 -// CHECK: buffer_store_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_xy v[1:2], s[4:7], s1 -// CHECK: buffer_store_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_xyz v[1:3], s[4:7], s1 -// CHECK: buffer_store_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_xyzw v[1:4], s[4:7], s1 -// CHECK: buffer_store_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_ubyte v1, s[4:7], s1 -// CHECK: buffer_load_ubyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_sbyte v1, s[4:7], s1 -// CHECK: buffer_load_sbyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_ushort v1, s[4:7], s1 -// CHECK: buffer_load_ushort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x01,0x01,0x01] - 
-buffer_load_sshort v1, s[4:7], s1 -// CHECK: buffer_load_sshort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, s[4:7], s1 -// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dwordx2 v[1:2], s[4:7], s1 -// CHECK: buffer_load_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dwordx4 v[1:4], s[4:7], s1 -// CHECK: buffer_load_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_byte v1, s[4:7], s1 -// CHECK: buffer_store_byte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_short v1, s[4:7], s1 -// CHECK: buffer_store_short v1, s[4:7], s1 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1 s[4:7], s1 -// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dwordx2 v[1:2], s[4:7], s1 -// CHECK: buffer_store_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dwordx4 v[1:4], s[4:7], s1 -// CHECK: buffer_store_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x01,0x01,0x01] - -// TODO: Atomics diff --git a/test/MC/R600/smrd.s b/test/MC/R600/smrd.s deleted file mode 100644 index b67abf7e689..00000000000 --- a/test/MC/R600/smrd.s +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -s_load_dword s1, s[2:3], 1 -// CHECK: s_load_dword s1, s[2:3], 0x1 ; encoding: [0x01,0x83,0x00,0xc0] - -s_load_dword s1, s[2:3], s4 -// CHECK: s_load_dword s1, s[2:3], s4 ; encoding: [0x04,0x82,0x00,0xc0] - -s_load_dwordx2 s[2:3], s[2:3], 1 -// CHECK: s_load_dwordx2 s[2:3], s[2:3], 0x1 ; encoding: [0x01,0x03,0x41,0xc0] - -s_load_dwordx2 s[2:3], s[2:3], s4 -// CHECK: s_load_dwordx2 s[2:3], s[2:3], s4 ; encoding: [0x04,0x02,0x41,0xc0] - -s_load_dwordx4 s[4:7], s[2:3], 1 -// CHECK: s_load_dwordx4 s[4:7], s[2:3], 0x1 ; encoding: [0x01,0x03,0x82,0xc0] - -s_load_dwordx4 s[4:7], s[2:3], s4 -// CHECK: s_load_dwordx4 s[4:7], s[2:3], s4 ; encoding: [0x04,0x02,0x82,0xc0] - -s_load_dwordx8 s[8:15], s[2:3], 1 -// CHECK: s_load_dwordx8 s[8:15], s[2:3], 0x1 ; encoding: [0x01,0x03,0xc4,0xc0] - -s_load_dwordx8 s[8:15], s[2:3], s4 -// CHECK: s_load_dwordx8 s[8:15], s[2:3], s4 ; encoding: [0x04,0x02,0xc4,0xc0] - -s_load_dwordx16 s[16:31], s[2:3], 1 -// CHECK: s_load_dwordx16 s[16:31], s[2:3], 0x1 ; encoding: [0x01,0x03,0x08,0xc1] - -s_load_dwordx16 s[16:31], s[2:3], s4 -// CHECK: s_load_dwordx16 s[16:31], s[2:3], s4 ; encoding: [0x04,0x02,0x08,0xc1] diff --git a/test/MC/R600/sop1-err.s b/test/MC/R600/sop1-err.s deleted file mode 100644 index f892356b623..00000000000 --- a/test/MC/R600/sop1-err.s +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s - -s_mov_b32 v1, s2 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s1, v0 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s[1:2], s0 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s0, s[1:2] -// CHECK: error: invalid operand for instruction - -s_mov_b32 s220, s0 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s0, s220 -// CHECK: error: invalid operand for instruction - -s_mov_b64 s1, s[0:1] -// CHECK: error: invalid operand for instruction - 
-s_mov_b64 s[0:1], s1 -// CHECK: error: invalid operand for instruction - -// Immediate greater than 32-bits -s_mov_b32 s1, 0xfffffffff -// CHECK: error: invalid immediate: only 32-bit values are legal - -// Immediate greater than 32-bits -s_mov_b64 s[0:1], 0xfffffffff -// CHECK: error: invalid immediate: only 32-bit values are legal - -// Out of range register -s_mov_b32 s diff --git a/test/MC/R600/sop1.s b/test/MC/R600/sop1.s deleted file mode 100644 index 92ca73f2500..00000000000 --- a/test/MC/R600/sop1.s +++ /dev/null @@ -1,177 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -s_mov_b32 s1, s2 -// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] - -s_mov_b32 s1, 1 -// CHECK: s_mov_b32 s1, 1 ; encoding: [0x81,0x03,0x81,0xbe] - -s_mov_b32 s1, 100 -// CHECK: s_mov_b32 s1, 0x64 ; encoding: [0xff,0x03,0x81,0xbe,0x64,0x00,0x00,0x00] - -s_mov_b64 s[2:3], s[4:5] -// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] - -s_mov_b64 s[2:3], 0xffffffffffffffff -// CHECK: s_mov_b64 s[2:3], -1 ; encoding: [0xc1,0x04,0x82,0xbe] - -s_cmov_b32 s1, 200 -// CHECK: s_cmov_b32 s1, 0xc8 ; encoding: [0xff,0x05,0x81,0xbe,0xc8,0x00,0x00,0x00] - -s_cmov_b32 s1, 1.0 -// CHECK: s_cmov_b32 s1, 1.0 ; encoding: [0xf2,0x05,0x81,0xbe] - -//s_cmov_b64 s[2:3], 1.0 -//CHECK-FIXME: s_cmov_b64 s[2:3], 1.0 ; encoding: [0xf2,0x05,0x82,0xb3] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -s_mov_b32 s1, s2 -// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] - -s_mov_b64 s[2:3], s[4:5] -// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] - -s_cmov_b32 s1, s2 -// CHECK: s_cmov_b32 s1, s2 ; encoding: [0x02,0x05,0x81,0xbe] - -s_cmov_b64 s[2:3], s[4:5] -// CHECK: s_cmov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x06,0x82,0xbe] - -s_not_b32 s1, s2 -// CHECK: s_not_b32 s1, s2 ; encoding: [0x02,0x07,0x81,0xbe] - -s_not_b64 s[2:3], s[4:5] -// CHECK: s_not_b64 s[2:3], s[4:5] ; encoding: [0x04,0x08,0x82,0xbe] - -s_wqm_b32 s1, s2 -// CHECK: s_wqm_b32 s1, s2 ; encoding: [0x02,0x09,0x81,0xbe] - -s_wqm_b64 s[2:3], s[4:5] -// CHECK: s_wqm_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0a,0x82,0xbe] - -s_brev_b32 s1, s2 -// CHECK: s_brev_b32 s1, s2 ; encoding: [0x02,0x0b,0x81,0xbe] - -s_brev_b64 s[2:3], s[4:5] -// CHECK: s_brev_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0c,0x82,0xbe] - -s_bcnt0_i32_b32 s1, s2 -// CHECK: s_bcnt0_i32_b32 s1, s2 ; encoding: [0x02,0x0d,0x81,0xbe] - -s_bcnt0_i32_b64 s1, s[2:3] -// CHECK: s_bcnt0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x0e,0x81,0xbe] - -s_bcnt1_i32_b32 s1, s2 -// CHECK: s_bcnt1_i32_b32 s1, s2 ; encoding: [0x02,0x0f,0x81,0xbe] - -s_bcnt1_i32_b64 s1, s[2:3] -// CHECK: s_bcnt1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x10,0x81,0xbe] - -s_ff0_i32_b32 s1, s2 -// CHECK: s_ff0_i32_b32 s1, s2 ; encoding: [0x02,0x11,0x81,0xbe] - -s_ff0_i32_b64 s1, s[2:3] -// CHECK: s_ff0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x12,0x81,0xbe] - -s_ff1_i32_b32 s1, s2 -// CHECK: s_ff1_i32_b32 s1, s2 ; encoding: [0x02,0x13,0x81,0xbe] - -s_ff1_i32_b64 s1, s[2:3] -// CHECK: s_ff1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x14,0x81,0xbe] - -s_flbit_i32_b32 s1, s2 -// CHECK: s_flbit_i32_b32 s1, s2 ; encoding: [0x02,0x15,0x81,0xbe] - -s_flbit_i32_b64 s1, s[2:3] -// CHECK: s_flbit_i32_b64 s1, s[2:3] ; encoding: [0x02,0x16,0x81,0xbe] - -s_flbit_i32 s1, s2 -// CHECK: s_flbit_i32 s1, s2 ; 
encoding: [0x02,0x17,0x81,0xbe] - -s_flbit_i32_i64 s1, s[2:3] -// CHECK: s_flbit_i32_i64 s1, s[2:3] ; encoding: [0x02,0x18,0x81,0xbe] - -s_sext_i32_i8 s1, s2 -// CHECK: s_sext_i32_i8 s1, s2 ; encoding: [0x02,0x19,0x81,0xbe] - -s_sext_i32_i16 s1, s2 -// CHECK: s_sext_i32_i16 s1, s2 ; encoding: [0x02,0x1a,0x81,0xbe] - -s_bitset0_b32 s1, s2 -// CHECK: s_bitset0_b32 s1, s2 ; encoding: [0x02,0x1b,0x81,0xbe] - -s_bitset0_b64 s[2:3], s[4:5] -// CHECK: s_bitset0_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1c,0x82,0xbe] - -s_bitset1_b32 s1, s2 -// CHECK: s_bitset1_b32 s1, s2 ; encoding: [0x02,0x1d,0x81,0xbe] - -s_bitset1_b64 s[2:3], s[4:5] -// CHECK: s_bitset1_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1e,0x82,0xbe] - -s_getpc_b64 s[2:3] -// CHECK: s_getpc_b64 s[2:3] ; encoding: [0x00,0x1f,0x82,0xbe] - -s_setpc_b64 s[2:3], s[4:5] -// CHECK: s_setpc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x20,0x82,0xbe] - -s_swappc_b64 s[2:3], s[4:5] -// CHECK: s_swappc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x21,0x82,0xbe] - -s_rfe_b64 s[2:3], s[4:5] -// CHECK: s_rfe_b64 s[2:3], s[4:5] ; encoding: [0x04,0x22,0x82,0xbe] - -s_and_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_and_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x24,0x82,0xbe] - -s_or_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_or_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x25,0x82,0xbe] - -s_xor_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_xor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x26,0x82,0xbe] - -s_andn2_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_andn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x27,0x82,0xbe] - -s_orn2_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_orn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x28,0x82,0xbe] - -s_nand_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_nand_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x29,0x82,0xbe] - -s_nor_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_nor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2a,0x82,0xbe] - -s_xnor_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_xnor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2b,0x82,0xbe] - -s_quadmask_b32 s1, s2 -// CHECK: s_quadmask_b32 s1, s2 ; encoding: [0x02,0x2c,0x81,0xbe] - -s_quadmask_b64 s[2:3], s[4:5] -// CHECK: s_quadmask_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2d,0x82,0xbe] - -s_movrels_b32 s1, s2 -// CHECK: s_movrels_b32 s1, s2 ; encoding: [0x02,0x2e,0x81,0xbe] - -s_movrels_b64 s[2:3], s[4:5] -// CHECK: s_movrels_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2f,0x82,0xbe] - -s_movreld_b32 s1, s2 -// CHECK: s_movreld_b32 s1, s2 ; encoding: [0x02,0x30,0x81,0xbe] - -s_movreld_b64 s[2:3], s[4:5] -// CHECK: s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x31,0x82,0xbe] - -s_cbranch_join s[4:5] -// CHECK: s_cbranch_join s[4:5] ; encoding: [0x04,0x32,0x80,0xbe] - -s_abs_i32 s1, s2 -// CHECK: s_abs_i32 s1, s2 ; encoding: [0x02,0x34,0x81,0xbe] - -s_mov_fed_b32 s1, s2 -// CHECK: s_mov_fed_b32 s1, s2 ; encoding: [0x02,0x35,0x81,0xbe] diff --git a/test/MC/R600/sop2.s b/test/MC/R600/sop2.s deleted file mode 100644 index 9a7a1c01064..00000000000 --- a/test/MC/R600/sop2.s +++ /dev/null @@ -1,131 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -// CHECK: s_add_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x80] -s_add_u32 s1, s2, s3 - -// CHECK: s_sub_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x80] -s_sub_u32 s1, s2, s3 - -// CHECK: s_add_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x81] -s_add_i32 s1, s2, s3 - -// CHECK: s_sub_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x81] -s_sub_i32 s1, s2, s3 
- -// CHECK: s_addc_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x82] -s_addc_u32 s1, s2, s3 - -// CHECK: s_subb_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x82] -s_subb_u32 s1, s2, s3 - -// CHECK: s_min_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x83] -s_min_i32 s1, s2, s3 - -// CHECK: s_min_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x83] -s_min_u32 s1, s2, s3 - -// CHECK: s_max_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x84] -s_max_i32 s1, s2, s3 - -// CHECK: s_max_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x84] -s_max_u32 s1, s2, s3 - -// CHECK: s_cselect_b32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x85] -s_cselect_b32 s1, s2, s3 - -// CHECK: s_cselect_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x85] -s_cselect_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_and_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x87] -s_and_b32 s2, s4, s6 - -// CHECK: s_and_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x87] -s_and_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_or_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x88] -s_or_b32 s2, s4, s6 - -// CHECK: s_or_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x88] -s_or_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_xor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x89] -s_xor_b32 s2, s4, s6 - -// CHECK: s_xor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x89] -s_xor_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_andn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8a] -s_andn2_b32 s2, s4, s6 - -// CHECK: s_andn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8a] -s_andn2_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_orn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8b] -s_orn2_b32 s2, s4, s6 - -// CHECK: s_orn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8b] -s_orn2_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_nand_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8c] -s_nand_b32 s2, s4, s6 - -// CHECK: s_nand_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8c] -s_nand_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_nor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8d] -s_nor_b32 s2, s4, s6 - -// CHECK: s_nor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8d] -s_nor_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_xnor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8e] -s_xnor_b32 s2, s4, s6 - -// CHECK: s_xnor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8e] -s_xnor_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_lshl_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8f] -s_lshl_b32 s2, s4, s6 - -// CHECK: s_lshl_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x8f] -s_lshl_b64 s[2:3], s[4:5], s6 - -// CHECK: s_lshr_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x90] -s_lshr_b32 s2, s4, s6 - -// CHECK: s_lshr_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x90] -s_lshr_b64 s[2:3], s[4:5], s6 - -// CHECK: s_ashr_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x91] -s_ashr_i32 s2, s4, s6 - -// CHECK: s_ashr_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x91] -s_ashr_i64 s[2:3], s[4:5], s6 - -// CHECK: s_bfm_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x92] -s_bfm_b32 s2, s4, s6 - -// CHECK: s_bfm_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x92] -s_bfm_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_mul_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x93] -s_mul_i32 s2, s4, s6 - -// CHECK: s_bfe_u32 s2, s4, s6 ; encoding: [0x04,0x06,0x82,0x93] -s_bfe_u32 s2, s4, s6 - -// CHECK: s_bfe_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x94] -s_bfe_i32 s2, s4, s6 - -// CHECK: s_bfe_u64 s[2:3], s[4:5], s[6:7] ; encoding: 
[0x04,0x06,0x82,0x94] -s_bfe_u64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_bfe_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x02,0x95] -s_bfe_i64 s[2:3], s[4:5], s6 - -// CHECK: s_cbranch_g_fork s[4:5], s[6:7] ; encoding: [0x04,0x06,0x80,0x95] -s_cbranch_g_fork s[4:5], s[6:7] - -// CHECK: s_absdiff_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x96] -s_absdiff_i32 s2, s4, s6 diff --git a/test/MC/R600/sopc.s b/test/MC/R600/sopc.s deleted file mode 100644 index 0899c1a2eed..00000000000 --- a/test/MC/R600/sopc.s +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -s_cmp_eq_i32 s1, s2 -// CHECK: s_cmp_eq_i32 s1, s2 ; encoding: [0x01,0x02,0x00,0xbf] diff --git a/test/MC/R600/sopk.s b/test/MC/R600/sopk.s deleted file mode 100644 index 6c27aaccb80..00000000000 --- a/test/MC/R600/sopk.s +++ /dev/null @@ -1,66 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -s_movk_i32 s2, 0x6 -// CHECK: s_movk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb0] - -s_cmovk_i32 s2, 0x6 -// CHECK: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] - -s_cmpk_eq_i32 s2, 0x6 -// CHECK: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] - -s_cmpk_lg_i32 s2, 0x6 -// CHECK: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] - -s_cmpk_gt_i32 s2, 0x6 -// CHECK: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] - -s_cmpk_ge_i32 s2, 0x6 -// CHECK: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] - -s_cmpk_lt_i32 s2, 0x6 -// CHECK: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] - -s_cmpk_le_i32 s2, 0x6 -// CHECK: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] - -s_cmpk_eq_u32 s2, 0x6 -// CHECK: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] - -s_cmpk_lg_u32 s2, 0x6 -// CHECK: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] - -s_cmpk_gt_u32 s2, 0x6 -// CHECK: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] - -s_cmpk_ge_u32 s2, 0x6 -// CHECK: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] - -s_cmpk_lt_u32 s2, 0x6 -// CHECK: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] - -s_cmpk_le_u32 s2, 0x6 -// CHECK: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] - -s_addk_i32 s2, 0x6 -// CHECK: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] - -s_mulk_i32 s2, 0x6 -// CHECK: s_mulk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb8] - -s_cbranch_i_fork s[2:3], 0x6 -// CHECK: s_cbranch_i_fork s[2:3], 0x6 ; encoding: [0x06,0x00,0x82,0xb8] - -s_getreg_b32 s2, 0x6 -// CHECK: s_getreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb9] - -s_setreg_b32 s2, 0x6 -// CHECK: s_setreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb9] - -s_setreg_imm32_b32 0xff, 0x6 -// CHECK: s_setreg_imm32_b32 0xff, 0x6 ; encoding: [0x06,0x00,0x80,0xba,0xff,0x00,0x00,0x00] diff --git a/test/MC/R600/sopp.s b/test/MC/R600/sopp.s deleted file mode 100644 index b072c16fdb2..00000000000 --- a/test/MC/R600/sopp.s +++ /dev/null @@ -1,64 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI 
-show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Edge Cases -//===----------------------------------------------------------------------===// - -s_nop 0 // CHECK: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -s_nop 0xffff // CHECK: s_nop 0xffff ; encoding: [0xff,0xff,0x80,0xbf] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - - s_nop 1 // CHECK: s_nop 1 ; encoding: [0x01,0x00,0x80,0xbf] - s_endpgm // CHECK: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] - s_branch 2 // CHECK: s_branch 2 ; encoding: [0x02,0x00,0x82,0xbf] - s_cbranch_scc0 3 // CHECK: s_cbranch_scc0 3 ; encoding: [0x03,0x00,0x84,0xbf] - s_cbranch_scc1 4 // CHECK: s_cbranch_scc1 4 ; encoding: [0x04,0x00,0x85,0xbf] - s_cbranch_vccz 5 // CHECK: s_cbranch_vccz 5 ; encoding: [0x05,0x00,0x86,0xbf] - s_cbranch_vccnz 6 // CHECK: s_cbranch_vccnz 6 ; encoding: [0x06,0x00,0x87,0xbf] - s_cbranch_execz 7 // CHECK: s_cbranch_execz 7 ; encoding: [0x07,0x00,0x88,0xbf] - s_cbranch_execnz 8 // CHECK: s_cbranch_execnz 8 ; encoding: [0x08,0x00,0x89,0xbf] - s_barrier // CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] - -//===----------------------------------------------------------------------===// -// s_waitcnt -//===----------------------------------------------------------------------===// - - s_waitcnt 0 - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(1) - // CHECK: s_waitcnt vmcnt(1) ; encoding: [0x71,0x07,0x8c,0xbf] - - s_waitcnt expcnt(2) - // CHECK: s_waitcnt expcnt(2) ; encoding: [0x2f,0x07,0x8c,0xbf] - - s_waitcnt lgkmcnt(3) - // CHECK: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0x03,0x8c,0xbf] - - s_waitcnt vmcnt(0), expcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x07,0x8c,0xbf] - - - s_sethalt 9 // CHECK: s_sethalt 9 ; encoding: [0x09,0x00,0x8d,0xbf] - s_sleep 10 // CHECK: s_sleep 10 ; encoding: [0x0a,0x00,0x8e,0xbf] - s_setprio 1 // CHECK: s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf] - s_sendmsg 2 // CHECK: s_sendmsg Gs(nop), [m0] ; encoding: [0x02,0x00,0x90,0xbf] - s_sendmsghalt 3 // CHECK: s_sendmsghalt 3 ; encoding: [0x03,0x00,0x91,0xbf] - s_trap 4 // CHECK: s_trap 4 ; encoding: [0x04,0x00,0x92,0xbf] - s_icache_inv // CHECK: s_icache_inv ; encoding: [0x00,0x00,0x93,0xbf] - s_incperflevel 5 // CHECK: s_incperflevel 5 ; encoding: [0x05,0x00,0x94,0xbf] - s_decperflevel 6 // CHECK: s_decperflevel 6 ; encoding: [0x06,0x00,0x95,0xbf] - s_ttracedata // CHECK: s_ttracedata ; encoding: [0x00,0x00,0x96,0xbf] diff --git a/test/MC/R600/vop1.s b/test/MC/R600/vop1.s deleted file mode 100644 index d0b00fcd189..00000000000 --- a/test/MC/R600/vop1.s +++ /dev/null @@ -1,357 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire 
-show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI --check-prefix=CIVI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI - -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI - - -// GCN: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -v_nop - -// GCN: v_mov_b32_e32 v1, v2 ; encoding: [0x02,0x03,0x02,0x7e] -v_mov_b32 v1, v2 - -// GCN: v_readfirstlane_b32 s1, v2 ; encoding: [0x02,0x05,0x02,0x7e] -v_readfirstlane_b32 s1, v2 - -// GCN: v_cvt_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x07,0x02,0x7e] -v_cvt_i32_f64 v1, v[2:3] - -// GCN: v_cvt_f64_i32_e32 v[1:2], v2 ; encoding: [0x02,0x09,0x02,0x7e] -v_cvt_f64_i32 v[1:2], v2 - -// GCN: v_cvt_f32_i32_e32 v1, v2 ; encoding: [0x02,0x0b,0x02,0x7e] -v_cvt_f32_i32 v1, v2 - -// GCN: v_cvt_f32_u32_e32 v1, v2 ; encoding: [0x02,0x0d,0x02,0x7e] -v_cvt_f32_u32 v1, v2 - -// GCN: v_cvt_u32_f32_e32 v1, v2 ; encoding: [0x02,0x0f,0x02,0x7e -v_cvt_u32_f32 v1, v2 - -// GCN: v_cvt_i32_f32_e32 v1, v2 ; encoding: [0x02,0x11,0x02,0x7e] -v_cvt_i32_f32 v1, v2 - -// SICI: v_mov_fed_b32_e32 v1, v2 ; encoding: [0x02,0x13,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -v_mov_fed_b32 v1, v2 - -// GCN: v_cvt_f16_f32_e32 v1, v2 ; encoding: [0x02,0x15,0x02,0x7e] -v_cvt_f16_f32 v1, v2 - -// GCN: v_cvt_f32_f16_e32 v1, v2 ; encoding: [0x02,0x17,0x02,0x7e] -v_cvt_f32_f16 v1, v2 - -// GCN: v_cvt_rpi_i32_f32_e32 v1, v2 ; encoding: [0x02,0x19,0x02,0x7e] -v_cvt_rpi_i32_f32 v1, v2 - -// GCN: v_cvt_flr_i32_f32_e32 v1, v2 ; encoding: [0x02,0x1b,0x02,0x7e] -v_cvt_flr_i32_f32 v1, v2 - -// GCN: v_cvt_off_f32_i4_e32 v1, v2 ; encoding: [0x02,0x1d,0x02,0x7e] -v_cvt_off_f32_i4_e32 v1, v2 - -// GCN: v_cvt_f32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x1f,0x02,0x7e] -v_cvt_f32_f64 v1, v[2:3] - -// GCN: v_cvt_f64_f32_e32 v[1:2], v2 ; encoding: [0x02,0x21,0x02,0x7e] -v_cvt_f64_f32 v[1:2], v2 - -// GCN: v_cvt_f32_ubyte0_e32 v1, v2 ; encoding: [0x02,0x23,0x02,0x7e] -v_cvt_f32_ubyte0 v1, v2 - -// GCN: v_cvt_f32_ubyte1_e32 v1, v2 ; encoding: [0x02,0x25,0x02,0x7e] -v_cvt_f32_ubyte1_e32 v1, v2 - -// GCN: v_cvt_f32_ubyte2_e32 v1, v2 ; encoding: [0x02,0x27,0x02,0x7e] -v_cvt_f32_ubyte2 v1, v2 - -// GCN: v_cvt_f32_ubyte3_e32 v1, v2 ; encoding: [0x02,0x29,0x02,0x7e] -v_cvt_f32_ubyte3 v1, v2 - -// GCN: v_cvt_u32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x2b,0x02,0x7e] -v_cvt_u32_f64 v1, v[2:3] - -// GCN: v_cvt_f64_u32_e32 v[1:2], v2 ; encoding: [0x02,0x2d,0x02,0x7e] -v_cvt_f64_u32 v[1:2], v2 - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_trunc_f64_e32 v[1:2], v[2:3] -// CIVI: v_trunc_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x2f,0x02,0x7e] -v_trunc_f64_e32 v[1:2], v[2:3] - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_ceil_f64_e32 v[1:2], v[2:3] -// CIVI: v_ceil_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x31,0x02,0x7e] -v_ceil_f64_e32 v[1:2], v[2:3] - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_rndne_f64_e32 v[1:2], v[2:3] -// CIVI: v_rndne_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x33,0x02,0x7e] -v_rndne_f64_e32 v[1:2], v[2:3] - -// NOSI: error: 
instruction not supported on this GPU -// NOSI: v_floor_f64_e32 v[1:2], v[2:3] -// CIVI: v_floor_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x35,0x02,0x7e] -v_floor_f64_e32 v[1:2], v[2:3] - -// SICI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] -// VI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x37,0x02,0x7e] -v_fract_f32 v1, v2 - -// SICI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] -// VI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x39,0x02,0x7e] -v_trunc_f32 v1, v2 - -// SICI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] -// VI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x3b,0x02,0x7e] -v_ceil_f32 v1, v2 - -// SICI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] -// VI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x3d,0x02,0x7e] -v_rndne_f32 v1, v2 - -// SICI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] -// VI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x3f,0x02,0x7e] -v_floor_f32_e32 v1, v2 - -// SICI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x4b,0x02,0x7e] -// VI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] -v_exp_f32 v1, v2 - -// SICI: v_log_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x4d,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_log_clamp_f32 v1, v2 -v_log_clamp_f32 v1, v2 - -// SICI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] -// VI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] -v_log_f32 v1, v2 - -// SICI: v_rcp_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x51,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rcp_clamp_f32 v1, v2 -v_rcp_clamp_f32 v1, v2 - -// SICI: v_rcp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rcp_legacy_f32 v1, v2 -v_rcp_legacy_f32 v1, v2 - -// SICI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] -// VI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] -v_rcp_f32 v1, v2 - -// SICI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] -// VI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] -v_rcp_iflag_f32 v1, v2 - -// SICI: v_rsq_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rsq_clamp_f32 v1, v2 -v_rsq_clamp_f32 v1, v2 - -// SICI: v_rsq_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rsq_legacy_f32 v1, v2 -v_rsq_legacy_f32 v1, v2 - -// SICI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] -// VI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] -v_rsq_f32_e32 v1, v2 - -// SICI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x5f,0x02,0x7e] -// VI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4b,0x02,0x7e] -v_rcp_f64 v[1:2], v[2:3] - -// SICI: v_rcp_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rcp_clamp_f64 v[1:2], v[2:3] -v_rcp_clamp_f64 v[1:2], v[2:3] - -// SICI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] -// VI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4d,0x02,0x7e] -v_rsq_f64 v[1:2], v[2:3] - -// SICI: v_rsq_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rsq_clamp_f64 v[1:2], v[2:3] -v_rsq_clamp_f64 v[1:2], v[2:3] - -// SICI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] -// VI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] 
-v_sqrt_f32 v1, v2 - -// SICI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x69,0x02,0x7e] -// VI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x51,0x02,0x7e] -v_sqrt_f64 v[1:2], v[2:3] - -// SICI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x6b,0x02,0x7e] -// VI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] -v_sin_f32 v1, v2 - -// SICI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] -// VI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] -v_cos_f32 v1, v2 - -// SICI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] -// VI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] -v_not_b32 v1, v2 - -// SICI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] -// VI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] -v_bfrev_b32 v1, v2 - -// SICI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] -// VI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] -v_ffbh_u32 v1, v2 - -// SICI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] -// VI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] -v_ffbl_b32 v1, v2 - -// SICI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] -// VI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x5f,0x02,0x7e] -v_ffbh_i32_e32 v1, v2 - -// SICI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x79,0x02,0x7e] -// VI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] -v_frexp_exp_i32_f64 v1, v[2:3] - -// SICI: v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7b,0x02,0x7e] -// VI; v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] -v_frexp_mant_f64 v[1:2], v[2:3] - -// SICI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7d,0x02,0x7e] -// VI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] -v_fract_f64 v[1:2], v[2:3] - -// SICI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] -// VI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] -v_frexp_exp_i32_f32 v1, v2 - -// SICI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] -// VI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x69,0x02,0x7e] -v_frexp_mant_f32 v1, v2 - -// SICI: v_clrexcp ; encoding: [0x00,0x82,0x00,0x7e] -// VI: v_clrexcp ; encoding: [0x00,0x6a,0x00,0x7e] -v_clrexcp - -// SICI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] -// VI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] -v_movreld_b32 v1, v2 - -// SICI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] -// VI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] -v_movrels_b32 v1, v2 - -// SICI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] -// VI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] -v_movrelsd_b32 v1, v2 - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_log_legacy_f32 v1, v2 -// CI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] -// VI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x99,0x02,0x7e] -v_log_legacy_f32 v1, v2 - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_exp_legacy_f32 v1, v2 -// CI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] -// VI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x97,0x02,0x7e] -v_exp_legacy_f32 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_f16_u16 v1, v2 -// VI: v_cvt_f16_u16_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] -v_cvt_f16_u16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_f16_i16 v1, v2 -// VI: 
v_cvt_f16_i16_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] -v_cvt_f16_i16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_u16_f16 v1, v2 -// VI: v_cvt_u16_f16_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] -v_cvt_u16_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_i16_f16 v1, v2 -// VI: v_cvt_i16_f16_e32 v1, v2 ; encoding: [0x02,0x79,0x02,0x7e] -v_cvt_i16_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_rcp_f16 v1, v2 -// VI: v_rcp_f16_e32 v1, v2 ; encoding: [0x02,0x7b,0x02,0x7e] -v_rcp_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sqrt_f16 v1, v2 -// VI: v_sqrt_f16_e32 v1, v2 ; encoding: [0x02,0x7d,0x02,0x7e] -v_sqrt_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_rsq_f16 v1, v2 -// VI: v_rsq_f16_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] -v_rsq_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_log_f16 v1, v2 -// VI: v_log_f16_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] -v_log_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_exp_f16 v1, v2 -// VI: v_exp_f16_e32 v1, v2 ; encoding: [0x02,0x83,0x02,0x7e] -v_exp_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_frexp_mant_f16 v1, v2 -// VI: v_frexp_mant_f16_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] -v_frexp_mant_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_frexp_exp_i16_f16 v1, v2 -// VI: v_frexp_exp_i16_f16_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] -v_frexp_exp_i16_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_floor_f16 v1, v2 -// VI: v_floor_f16_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] -v_floor_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_ceil_f16 v1, v2 -// VI: v_ceil_f16_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] -v_ceil_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_trunc_f16 v1, v2 -// VI: v_trunc_f16_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] -v_trunc_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_rndne_f16 v1, v2 -// VI: v_rndne_f16_e32 v1, v2 ; encoding: [0x02,0x8f,0x02,0x7e] -v_rndne_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_fract_f16 v1, v2 -// VI: v_fract_f16_e32 v1, v2 ; encoding: [0x02,0x91,0x02,0x7e] -v_fract_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sin_f16 v1, v2 -// VI: v_sin_f16_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] -v_sin_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cos_f16 v1, v2 -// VI: v_cos_f16_e32 v1, v2 ; encoding: [0x02,0x95,0x02,0x7e] -v_cos_f16 v1, v2 diff --git a/test/MC/R600/vop2-err.s b/test/MC/R600/vop2-err.s deleted file mode 100644 index a1131000a90..00000000000 --- a/test/MC/R600/vop2-err.s +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s - -//===----------------------------------------------------------------------===// -// Generic checks -//===----------------------------------------------------------------------===// - -v_mul_i32_i24 v1, v2, 100 -// CHECK: error: invalid operand for instruction - -//===----------------------------------------------------------------------===// -// _e32 
checks -//===----------------------------------------------------------------------===// - -// Immediate src1 -v_mul_i32_i24_e32 v1, v2, 100 -// CHECK: error: invalid operand for instruction - -// sgpr src1 -v_mul_i32_i24_e32 v1, v2, s3 -// CHECK: error: invalid operand for instruction - -//===----------------------------------------------------------------------===// -// _e64 checks -//===----------------------------------------------------------------------===// - -// Immediate src0 -v_mul_i32_i24_e64 v1, 100, v3 -// CHECK: error: invalid operand for instruction - -// Immediate src1 -v_mul_i32_i24_e64 v1, v2, 100 -// CHECK: error: invalid operand for instruction - -// TODO: Constant bus restrictions diff --git a/test/MC/R600/vop2.s b/test/MC/R600/vop2.s deleted file mode 100644 index a1f3b8d8936..00000000000 --- a/test/MC/R600/vop2.s +++ /dev/null @@ -1,421 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI - -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI - -//===----------------------------------------------------------------------===// -// Generic Checks for floating-point instructions (These have modifiers). -//===----------------------------------------------------------------------===// - -// TODO: 64-bit encoding of instructions with modifiers - -// _e32 suffix -// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] -v_add_f32_e32 v1, v2, v3 - -// src0 inline immediate -// SICI: v_add_f32_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x06] -v_add_f32 v1, 1.0, v3 - -// src0 negative inline immediate -// SICI: v_add_f32_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x06] -v_add_f32 v1, -1.0, v3 - -// src0 literal -// SICI: v_add_f32_e32 v1, 0x42c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0x42] -v_add_f32 v1, 100.0, v3 - -// src0 negative literal -// SICI: v_add_f32_e32 v1, 0xc2c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0xc2] -v_add_f32 v1, -100.0, v3 - -//===----------------------------------------------------------------------===// -// Generic Checks for integer instructions (These don't have modifiers). 
-//===----------------------------------------------------------------------===// - -// _e32 suffix -// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] -v_mul_i32_i24_e32 v1, v2, v3 - -// _e64 suffix -// SICI: v_mul_i32_i24_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x02,0x00] -v_mul_i32_i24_e64 v1, v2, v3 - -// src0 inline -// SICI: v_mul_i32_i24_e32 v1, 3, v3 ; encoding: [0x83,0x06,0x02,0x12] -v_mul_i32_i24 v1, 3, v3 - -// src0 negative inline -// SICI: v_mul_i32_i24_e32 v1, -3, v3 ; encoding: [0xc3,0x06,0x02,0x12] -v_mul_i32_i24 v1, -3, v3 - -// src1 inline -// SICI: v_mul_i32_i24_e64 v1, v2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x01,0x00] -v_mul_i32_i24 v1, v2, 3 - -// src1 negative inline -// SICI: v_mul_i32_i24_e64 v1, v2, -3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x87,0x01,0x00] -v_mul_i32_i24 v1, v2, -3 - -// src0 literal -// SICI: v_mul_i32_i24_e32 v1, 0x64, v3 ; encoding: [0xff,0x06,0x02,0x12,0x64,0x00,0x00,0x00] -v_mul_i32_i24 v1, 100, v3 - -// src1 negative literal -// SICI: v_mul_i32_i24_e32 v1, 0xffffff9c, v3 ; encoding: [0xff,0x06,0x02,0x12,0x9c,0xff,0xff,0xff] -v_mul_i32_i24 v1, -100, v3 - -//===----------------------------------------------------------------------===// -// Checks for legal operands -//===----------------------------------------------------------------------===// - -// src0 sgpr -// SICI: v_mul_i32_i24_e32 v1, s2, v3 ; encoding: [0x02,0x06,0x02,0x12] -v_mul_i32_i24 v1, s2, v3 - -// src1 sgpr -// SICI: v_mul_i32_i24_e64 v1, v2, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x00,0x00] -v_mul_i32_i24 v1, v2, s3 - -// src0, src1 same sgpr -// SICI: v_mul_i32_i24_e64 v1, s2, s2 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x04,0x00,0x00] -v_mul_i32_i24 v1, s2, s2 - -// src0 sgpr, src1 inline -// SICI: v_mul_i32_i24_e64 v1, s2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x06,0x01,0x00] -v_mul_i32_i24 v1, s2, 3 - -// src0 inline src1 sgpr -// SICI: v_mul_i32_i24_e64 v1, 3, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x83,0x06,0x00,0x00] -v_mul_i32_i24 v1, 3, s3 - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -// GCN: v_cndmask_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x00] -v_cndmask_b32 v1, v2, v3 - -// SICI: v_readlane_b32 s1, v2, s3 ; encoding: [0x02,0x07,0x02,0x02] -// VI: v_readlane_b32 s1, v2, s3 ; encoding: [0x01,0x00,0x89,0xd2,0x02,0x07,0x00,0x00] -v_readlane_b32 s1, v2, s3 - -// SICI: v_writelane_b32 v1, s2, s3 ; encoding: [0x02,0x06,0x02,0x04] -// VI: v_writelane_b32 v1, s2, s3 ; encoding: [0x01,0x00,0x8a,0xd2,0x02,0x06,0x00,0x00] -v_writelane_b32 v1, s2, s3 - -// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] -// VI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x02] -v_add_f32 v1, v2, v3 - -// SICI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] -// VI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x04] -v_sub_f32 v1, v2, v3 - -// SICI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] -// VI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] -v_subrev_f32 v1, v2, v3 - -// SICI: v_mac_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_mac_legacy_f32 v1, v2, v3 -v_mac_legacy_f32 v1, v2, v3 - -// SICI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] -// VI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] 
-v_mul_legacy_f32_e32 v1, v2, v3 - -// SICI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] -// VI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] -v_mul_f32 v1, v2, v3 - -// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] -// VI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] -v_mul_i32_i24 v1, v2, v3 - -// SICI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] -// VI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] -v_mul_hi_i32_i24 v1, v2, v3 - -// SICI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] -// VI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] -v_mul_u32_u24 v1, v2, v3 - -// SICI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] -// VI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] -v_mul_hi_u32_u24 v1, v2, v3 - -// SICI: v_min_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_min_legacy_f32_e32 v1, v2, v3 -v_min_legacy_f32_e32 v1, v2, v3 - -// SICI: v_max_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_max_legacy_f32 v1, v2, v3 -v_max_legacy_f32 v1, v2, v3 - -// SICI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] -// VI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] -v_min_f32_e32 v1, v2, v3 - -// SICI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] -// VI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] -v_max_f32 v1, v2 v3 - -// SICI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] -// VI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] -v_min_i32 v1, v2, v3 - -// SICI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] -// VI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] -v_max_i32 v1, v2, v3 - -// SICI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] -// VI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] -v_min_u32 v1, v2, v3 - -// SICI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] -// VI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] -v_max_u32 v1, v2, v3 - -// SICI: v_lshr_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_lshr_b32 v1, v2, v3 -v_lshr_b32 v1, v2, v3 - -// SICI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] -// VI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] -v_lshrrev_b32 v1, v2, v3 - -// SICI: v_ashr_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_ashr_i32 v1, v2, v3 -v_ashr_i32 v1, v2, v3 - -// SICI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x30] -// VI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] -v_ashrrev_i32 v1, v2, v3 - -// SICI: v_lshl_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_lshl_b32_e32 v1, v2, v3 -v_lshl_b32_e32 v1, v2, v3 - -// SICI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] -// VI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] -v_lshlrev_b32 v1, v2, v3 - -// SICI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] -// VI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] -v_and_b32 v1, v2, v3 - -// SICI: v_or_b32_e32 v1, v2, v3 ; encoding: 
[0x02,0x07,0x02,0x38] -// VI: v_or_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] -v_or_b32 v1, v2, v3 - -// SICI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] -// VI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] -v_xor_b32 v1, v2, v3 - -// SICI: v_bfm_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] -// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00] -v_bfm_b32 v1, v2, v3 - -// SICI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] -// VI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] -v_mac_f32 v1, v2, v3 - -// SICI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x40,0x00,0x00,0x80,0x42] -// VI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42] -v_madmk_f32 v1, v2, v3, 64.0 - -// SICI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x42,0x00,0x00,0x80,0x42] -// VI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42] -v_madak_f32 v1, v2, v3, 64.0 - -// SICI: v_bcnt_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] -// VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00] -v_bcnt_u32_b32 v1, v2, v3 - -// SICI: v_mbcnt_lo_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] -// VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00] -v_mbcnt_lo_u32_b32 v1, v2, v3 - -// SICI: v_mbcnt_hi_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x48] -// VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00] -v_mbcnt_hi_u32_b32 v1, v2, v3 - -// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] -// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] -v_add_i32 v1, v2, v3 - -// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] -// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] -v_add_u32 v1, v2, v3 - -// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] -// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] -v_sub_i32 v1, v2, v3 - -// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] -// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] -v_sub_u32 v1, v2, v3 - -// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] -// VI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] -v_subrev_i32 v1, v2, v3 - -// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] -// VI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] -v_subrev_u32 v1, v2, v3 - -// SICI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] -// VI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x38] -v_addc_u32 v1, v2, v3 - -// SICI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] -// VI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] -v_subb_u32 v1, v2, v3 - -// SICI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] -// VI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] -v_subbrev_u32 v1, v2, v3 - -// SICI: v_ldexp_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] -// VI: v_ldexp_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00] -v_ldexp_f32 v1, v2, v3 - -// SICI: v_cvt_pkaccum_u8_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58] -// VI: v_cvt_pkaccum_u8_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00] -v_cvt_pkaccum_u8_f32 
v1, v2, v3 - -// SICI: v_cvt_pknorm_i16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a] -// VI: v_cvt_pknorm_i16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pknorm_i16_f32 v1, v2, v3 - -// SICI: v_cvt_pknorm_u16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c] -// VI: v_cvt_pknorm_u16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pknorm_u16_f32 v1, v2, v3 - -// SICI: v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e] -// VI: v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pkrtz_f16_f32 v1, v2, v3 - -// SICI: v_cvt_pk_u16_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60] -// VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pk_u16_u32 v1, v2, v3 - -// SICI: v_cvt_pk_i16_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62] -// VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pk_i16_i32 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_add_f16 v1, v2, v3 -// VI: v_add_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] -v_add_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sub_f16 v1, v2, v3 -// VI: v_sub_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x40] -v_sub_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_subrev_f16 v1, v2, v3 -// VI: v_subrev_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x42] -v_subrev_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_mul_f16 v1, v2, v3 -// VI: v_mul_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] -v_mul_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_mac_f16 v1, v2, v3 -// VI: v_mac_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] -v_mac_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_madmk_f16 v1, v2, v3, 64.0 -// VI: v_madmk_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42] -v_madmk_f16 v1, v2, v3, 64.0 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_madak_f16 v1, v2, v3, 64.0 -// VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42] -v_madak_f16 v1, v2, v3, 64.0 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_add_u16 v1, v2, v3 -// VI: v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] -v_add_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sub_u16 v1, v2, v3 -// VI: v_sub_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] -v_sub_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_subrev_u16 v1, v2, v3 -// VI: v_subrev_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] -v_subrev_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_mul_lo_u16 v1, v2, v3 -// VI: v_mul_lo_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] -v_mul_lo_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_lshlrev_b16 v1, v2, v3 -// VI: v_lshlrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] -v_lshlrev_b16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_lshrrev_b16 v1, v2, v3 -// VI: v_lshrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] -v_lshrrev_b16 v1, v2, v3 - 
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_ashrrev_b16 v1, v2, v3
-// VI: v_ashrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58]
-v_ashrrev_b16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_max_f16 v1, v2, v3
-// VI: v_max_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a]
-v_max_f16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_min_f16 v1, v2, v3
-// VI: v_min_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c]
-v_min_f16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_max_u16 v1, v2, v3
-// VI: v_max_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e]
-v_max_u16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_max_i16 v1, v2, v3
-// VI: v_max_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60]
-v_max_i16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_min_u16 v1, v2, v3
-// VI: v_min_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62]
-v_min_u16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_min_i16 v1, v2, v3
-// VI: v_min_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x64]
-v_min_i16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_ldexp_f16 v1, v2, v3
-// VI: v_ldexp_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x66]
-v_ldexp_f16 v1, v2, v3
diff --git a/test/MC/R600/vop3-errs.s b/test/MC/R600/vop3-errs.s
deleted file mode 100644
index b57fe6d5314..00000000000
--- a/test/MC/R600/vop3-errs.s
+++ /dev/null
@@ -1,5 +0,0 @@
-// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s
-
-v_add_f32_e64 v0, v1
-// CHECK: error: too few operands for instruction
diff --git a/test/MC/R600/vop3.s b/test/MC/R600/vop3.s
deleted file mode 100644
index 20562335974..00000000000
--- a/test/MC/R600/vop3.s
+++ /dev/null
@@ -1,149 +0,0 @@
-// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s
-// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s
-
-//===----------------------------------------------------------------------===//
-// VOPC Instructions
-//===----------------------------------------------------------------------===//
-
-// Test forced e64 encoding
-
-v_cmp_lt_f32_e64 s[2:3], v4, -v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40]
-
-//
-// Modifier tests:
-//
-
-v_cmp_lt_f32 s[2:3] -v4, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x20]
-
-v_cmp_lt_f32 s[2:3] v4, -v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40]
-
-v_cmp_lt_f32 s[2:3] -v4, -v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x60]
-
-v_cmp_lt_f32 s[2:3] |v4|, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3] v4, |v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, |v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3] |v4|, |v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, |v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3] -|v4|, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x20]
-
-v_cmp_lt_f32 s[2:3] v4, -|v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -|v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x40]
-
-v_cmp_lt_f32 s[2:3] -|v4|, -|v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, -|v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x60]
-
-//
-// Instruction tests:
-//
-
-v_cmp_f_f32 s[2:3], v4, v6
-// CHECK: v_cmp_f_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x00,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3], v4, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_eq_f32 s[2:3], v4, v6
-// CHECK: v_cmp_eq_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x04,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_le_f32 s[2:3], v4, v6
-// CHECK: v_cmp_le_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x06,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_gt_f32 s[2:3], v4, v6
-// CHECK: v_cmp_gt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x08,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lg_f32 s[2:3], v4, v6
-// CHECK: v_cmp_lg_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0a,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_ge_f32 s[2:3], v4, v6
-// CHECK: v_cmp_ge_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0c,0xd0,0x04,0x0d,0x02,0x00]
-
-// TODO: Finish VOPC
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-//
-// Modifier tests:
-//
-
-v_fract_f32 v1, -v2
-// CHECK: v_fract_f32_e64 v1, -v2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x20]
-
-v_fract_f32 v1, |v2|
-// CHECK: v_fract_f32_e64 v1, |v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x00]
-
-v_fract_f32 v1, -|v2|
-// CHECK: v_fract_f32_e64 v1, -|v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x20]
-
-v_fract_f32 v1, v2 clamp
-// CHECK: v_fract_f32_e64 v1, v2 clamp ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x00]
-
-v_fract_f32 v1, v2 mul:2
-// CHECK: v_fract_f32_e64 v1, v2 mul:2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x08]
-
-v_fract_f32 v1, v2, div:2 clamp
-// CHECK: v_fract_f32_e64 v1, v2 clamp div:2 ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x18]
-
-// TODO: Finish VOP1
-
-///===---------------------------------------------------------------------===//
-// VOP2 Instructions
-///===---------------------------------------------------------------------===//
-
-// Test forced e64 encoding with e32 operands
-
-v_ldexp_f32_e64 v1, v3, v5
-// CHECK: v_ldexp_f32_e64 v1, v3, v5 ; encoding: [0x01,0x00,0x56,0xd2,0x03,0x0b,0x02,0x00]
-
-
-// TODO: Modifier tests
-
-v_cndmask_b32 v1, v3, v5, s[4:5]
-// CHECK: v_cndmask_b32_e64 v1, v3, v5, s[4:5] ; encoding: [0x01,0x00,0x00,0xd2,0x03,0x0b,0x12,0x00]
-
-//TODO: readlane, writelane
-
-v_add_f32 v1, v3, s5
-// CHECK: v_add_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x06,0xd2,0x03,0x0b,0x00,0x00]
-
-v_sub_f32 v1, v3, s5
-// CHECK: v_sub_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x08,0xd2,0x03,0x0b,0x00,0x00]
-
-v_subrev_f32 v1, v3, s5
-// CHECK: v_subrev_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0a,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mac_legacy_f32 v1, v3, s5
-// CHECK: v_mac_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0c,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mul_legacy_f32 v1, v3, s5
-// CHECK: v_mul_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0e,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mul_f32 v1, v3, s5
-// CHECK: v_mul_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x10,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mul_i32_i24 v1, v3, s5
-// CHECK: v_mul_i32_i24_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x12,0xd2,0x03,0x0b,0x00,0x00]
-
-///===---------------------------------------------------------------------===// -// VOP3 Instructions -///===---------------------------------------------------------------------===// - -// TODO: Modifier tests - -v_mad_legacy_f32 v2, v4, v6, v8 -// CHECK: v_mad_legacy_f32 v2, v4, v6, v8 ; encoding: [0x02,0x00,0x80,0xd2,0x04,0x0d,0x22,0x04] - - - - - diff --git a/test/MC/R600/vopc.s b/test/MC/R600/vopc.s deleted file mode 100644 index f44919a4f1e..00000000000 --- a/test/MC/R600/vopc.s +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Generic Checks -//===----------------------------------------------------------------------===// - -// src0 sgpr -v_cmp_lt_f32 vcc, s2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x02,0x7c] - -// src0 inline immediate -v_cmp_lt_f32 vcc, 0, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x02,0x7c] - -// src0 literal -v_cmp_lt_f32 vcc, 10.0, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x02,0x7c,0x00,0x00,0x20,0x41] - -// src0, src1 max vgpr -v_cmp_lt_f32 vcc, v255, v255 -// CHECK: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x03,0x7c] - -// force 32-bit encoding -v_cmp_lt_f32_e32 vcc, v2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] - - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -v_cmp_f_f32 vcc, v2, v4 -// CHECK: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x00,0x7c] - -v_cmp_lt_f32 vcc, v2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] - -// TODO: Add tests for the rest of the instructions. -